Commit 8afa2bc6e3feca80a55c24035b2bfb257d2310ff by wenxin

Code optimization

1 parent 7acfaa5f
@@ -97,10 +97,10 @@ def get_dates() -> Tuple[str, str]:
     :return: Tuple[str, str]
     """
     today = datetime.today()
-    yesterday = today - timedelta(days=1)
-    today_str = today.strftime("%Y-%m-%d")
-    yesterday_str = yesterday.strftime("%Y-%m-%d")
-    return today_str, yesterday_str
+    start = today - timedelta(days=4)
+    end = today.strftime("%Y-%m-%d")
+    start_str = start.strftime("%Y-%m-%d")
+    return end, start_str


 if __name__ == "__main__":
......
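The reworked get_dates() now returns a five-day window, (today, today minus four days), instead of (today, yesterday). A self-contained sketch of the new behaviour, using only the standard library:

from datetime import datetime, timedelta
from typing import Tuple


def get_dates() -> Tuple[str, str]:
    # New behaviour: the end date is today, the start date is four days earlier.
    today = datetime.today()
    start = today - timedelta(days=4)
    return today.strftime("%Y-%m-%d"), start.strftime("%Y-%m-%d")


# e.g. run on 2024-05-10 this returns ("2024-05-10", "2024-05-06")
print(get_dates())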
@@ -9,6 +9,7 @@ from app.schemas.config_schema import UrlTemplateInfo
 from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel
 from app.config.app_config import getAppConfig
 from superstream import Stream
+from tqdm.asyncio import tqdm_asyncio

 logger = logging.getLogger(__name__)

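tqdm_asyncio.gather, imported above, is a drop-in replacement for asyncio.gather that also draws a progress bar while the awaitables complete. A minimal illustration; the work coroutine is a made-up stand-in for the real fetch calls:

import asyncio

from tqdm.asyncio import tqdm_asyncio


async def work(i: int) -> int:
    # Stand-in for a real HTTP request.
    await asyncio.sleep(0.1)
    return i * i


async def main() -> None:
    # Awaits the coroutines like asyncio.gather and preserves result order,
    # but renders a progress bar as tasks finish.
    results = await tqdm_asyncio.gather(*(work(i) for i in range(20)))
    print(results)


asyncio.run(main())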
@@ -28,28 +29,25 @@ def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[str, str]]]:
     return target_urls


-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
 async def fetch(session, url):
     async with session.get(url) as response:
-        return await response.json()
+        if response.status != 200:
+            logger.error(f"请求失败,状态码为:{response.status}")
+        else:
+            return await response.json()


 async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]:
     async with aiohttp.ClientSession() as session:
         requests = [fetch(session, url[0]) for url in urls]
-        results = await asyncio.gather(*requests)
+        results = await tqdm_asyncio.gather(*requests)
         return_data = {}
         for k, v in zip(urls, results):
             return_data[k[1]] = ResponseModel(**v)
         return return_data


-def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
+def convert_results(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
     results: List[GuoDuoSpiderResult] = []
     for k, v in guduo_responses.items():
         time_request = k
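Note that the reworked fetch() falls through (implicitly returning None) on non-200 responses, while fetch_and_parse() still unpacks every result with ResponseModel(**v). A hedged sketch of one way to guard against that, using plain dicts in place of the project's ResponseModel:

import asyncio
from typing import Dict, List, Optional, Tuple

import aiohttp


async def fetch(session: aiohttp.ClientSession, url: str) -> Optional[dict]:
    # Return the parsed JSON body, or None when the status is not 200.
    async with session.get(url) as response:
        if response.status != 200:
            return None
        return await response.json()


async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, dict]:
    # urls is a list of (url, key) pairs, as in the module above.
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, url) for url, _ in urls))
    # Skip failed requests instead of unpacking None into a model.
    return {key: body for (_, key), body in zip(urls, results) if body is not None}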
@@ -62,24 +60,33 @@ def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
         results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType,
                                           platform=platform, sourceName=sourceName,
                                           score=score, createTime=createTime))
+        logger.info(f"处理类型是{dataType}的数据,时间是{time_request},获取结果长度为{len(v.data)}")
     return results


 async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum):
     result = await fetch_and_parse(urls)
-    return parse_response_to_spider_result(result, enum)
+    return convert_results(result, enum)


+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     """
     获取评分数据
     :param spiderParam: 爬取参数
     :return:
     """
+    logger.info(f"开始爬取数据,爬取参数为:{spiderParam}")
     url_infos = build_request_url(spiderParam)
     tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()]
-    results = await asyncio.gather(*tasks)
+    results = await tqdm_asyncio.gather(*tasks)
     spider_datas = [item for sublist in results for item in sublist]
+    logger.info(f"爬取数据结束,共爬取{len(spider_datas)}条数据")
     return spider_datas


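The tenacity @retry decorator moves from fetch() onto get_score_data(), so a failure now retries the whole crawl pass rather than an individual request; tenacity applies the same decorator to async functions. A small self-contained sketch with the same retry policy, where flaky_job is a made-up stand-in for get_score_data:

import asyncio
import logging
import random

from tenacity import after_log, before_sleep_log, retry, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)


@retry(
    stop=stop_after_attempt(3),                          # give up after three attempts
    wait=wait_exponential(multiplier=1, min=1, max=10),  # exponential back-off, capped between 1 s and 10 s
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    after=after_log(logger, logging.DEBUG),
)
async def flaky_job() -> str:
    # Made-up coroutine that fails randomly to exercise the retry policy.
    if random.random() < 0.5:
        raise RuntimeError("transient failure")
    return "ok"


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    print(asyncio.run(flaky_job()))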
@@ -97,6 +104,7 @@ if __name__ == '__main__':
         ],
         url="http://guduodata.com",
     )
+    print(param.get_time_range())
     res = asyncio.run(get_score_data(param))
     print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
     print(res)
......
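A side note on the timing print in the context lines above: time.time() already returns seconds, so dividing the difference by 1000 under-reports the elapsed time. A minimal timing sketch (no scaling needed):

import time

start_time = time.perf_counter()
# ... run the crawl here ...
elapsed = time.perf_counter() - start_time  # perf_counter() is already in seconds
print(f"elapsed: {elapsed:.2f}s")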