get_data.py

# -*- coding:utf-8 -*-
"""
Author: BigCat
"""
import argparse

import requests
import pandas as pd
from bs4 import BeautifulSoup
from loguru import logger

# config is expected to re-export os and to define the name/path settings used below
from config import os, name_path, data_file_name

parser = argparse.ArgumentParser()
parser.add_argument("--name", default="ssq", type=str, help="which lottery to crawl: ssq (双色球) or dlt (大乐透)")
args = parser.parse_args()
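
# Typical invocation (assuming config.py provides name_path and data_file_name and the
# script is run from the project root):
#   python get_data.py --name ssq   # 双色球 history
#   python get_data.py --name dlt   # 大乐透 history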


def get_url(name):
    """Build the base URL and query-path template for the given lottery.
    :param name: lottery name ("ssq" or "dlt")
    :return: (base url, query-path template)
    """
    url = "https://datachart.500.com/{}/history/".format(name)
    path = "newinc/history.php?start={}&end="
    return url, path
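# For example, get_url("ssq") returns
#   ("https://datachart.500.com/ssq/history/", "newinc/history.php?start={}&end=")
# spider() later fills the start placeholder and appends the end issue number.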


def get_current_number(name):
    """Fetch the latest issue number for the given lottery.
    :param name: lottery name ("ssq" or "dlt")
    :return: latest issue number as a string
    """
    url, _ = get_url(name)
    # verify=False keeps the original behaviour of skipping SSL certificate checks
    r = requests.get("{}{}".format(url, "history.shtml"), verify=False)
    r.encoding = "gb2312"  # the page is served as gb2312
    soup = BeautifulSoup(r.text, "lxml")
    current_num = soup.find("div", class_="wrap_datachart").find("input", id="end")["value"]
    return current_num
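# Note: the value comes from the page's hidden <input id="end">, so it is a string
# (e.g. something like "23088" -- illustrative, not a real value); run() passes it
# on to spider() unchanged.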


def spider(name, start, end, mode):
    """Crawl historical draw data.
    :param name: lottery name ("ssq" or "dlt")
    :param start: first issue to fetch
    :param end: latest issue to fetch
    :param mode: "train" also saves the data to a CSV file, "predict" only returns it
    :return: pandas.DataFrame with one row per draw
    """
    url, path = get_url(name)
    url = "{}{}{}".format(url, path.format(start), end)
    r = requests.get(url=url, verify=False)
    r.encoding = "gb2312"
    soup = BeautifulSoup(r.text, "lxml")
    trs = soup.find("tbody", attrs={"id": "tdata"}).find_all("tr")
    data = []
    for tr in trs:
        item = dict()
        tds = tr.find_all("td")
        if name == "ssq":
            # 双色球: issue number, 6 red balls, 1 blue ball
            item[u"期数"] = tds[0].get_text().strip()
            for i in range(6):
                item[u"红球_{}".format(i + 1)] = tds[i + 1].get_text().strip()
            item[u"蓝球"] = tds[7].get_text().strip()
            data.append(item)
        elif name == "dlt":
            # 大乐透: issue number, 5 red balls, 2 blue balls
            item[u"期数"] = tds[0].get_text().strip()
            for i in range(5):
                item[u"红球_{}".format(i + 1)] = tds[i + 1].get_text().strip()
            for j in range(2):
                item[u"蓝球_{}".format(j + 1)] = tds[6 + j].get_text().strip()
            data.append(item)
        else:
            logger.warning("Sorry, no data source was found!")
    if mode == "train":
        df = pd.DataFrame(data)
        df.to_csv("{}{}".format(name_path[name]["path"], data_file_name), encoding="utf-8")
        return df
    elif mode == "predict":
        return pd.DataFrame(data)
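# Columns of the returned DataFrame (taken from the table parsing above):
#   ssq: 期数, 红球_1 .. 红球_6, 蓝球            -- issue number, 6 red balls, 1 blue ball
#   dlt: 期数, 红球_1 .. 红球_5, 蓝球_1, 蓝球_2  -- issue number, 5 red balls, 2 blue balls
# The Chinese column names are kept verbatim since downstream code presumably selects them by name.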


def run(name):
    """Crawl the full history for one lottery and save it for training.
    :param name: lottery name ("ssq" or "dlt")
    :return:
    """
    current_number = get_current_number(name)
    logger.info("[{}] latest issue number: {}".format(name_path[name]["name"], current_number))
    logger.info("Fetching [{}] data ...".format(name_path[name]["name"]))
    if not os.path.exists(name_path[name]["path"]):
        os.makedirs(name_path[name]["path"])
    data = spider(name, 1, current_number, "train")
    # crude sanity check: the crawl is expected to have created a "data" directory under the cwd
    if "data" in os.listdir(os.getcwd()):
        logger.info("[{}] data is ready, {} issues in total; the model can be trained next ...".format(
            name_path[name]["name"], len(data)))
    else:
        logger.error("Data file does not exist!")


if __name__ == "__main__":
    if not args.name:
        raise Exception("The lottery name must not be empty!")
    else:
        run(name=args.name)