代码优化
Showing 2 changed files with 23 additions and 15 deletions
... | @@ -97,10 +97,10 @@ def get_dates() -> Tuple[str, str]: | ... | @@ -97,10 +97,10 @@ def get_dates() -> Tuple[str, str]: |
97 | :return: Tuple[str, str] | 97 | :return: Tuple[str, str] |
98 | """ | 98 | """ |
99 | today = datetime.today() | 99 | today = datetime.today() |
100 | yesterday = today - timedelta(days=1) | 100 | start = today - timedelta(days=4) |
101 | today_str = today.strftime("%Y-%m-%d") | 101 | end = today.strftime("%Y-%m-%d") |
102 | yesterday_str = yesterday.strftime("%Y-%m-%d") | 102 | start_str = start.strftime("%Y-%m-%d") |
103 | return today_str, yesterday_str | 103 | return end, start_str |
104 | 104 | ||
105 | 105 | ||
106 | if __name__ == "__main__": | 106 | if __name__ == "__main__": | ... | ... |
... | @@ -9,6 +9,7 @@ from app.schemas.config_schema import UrlTemplateInfo | ... | @@ -9,6 +9,7 @@ from app.schemas.config_schema import UrlTemplateInfo |
9 | from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel | 9 | from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel |
10 | from app.config.app_config import getAppConfig | 10 | from app.config.app_config import getAppConfig |
11 | from superstream import Stream | 11 | from superstream import Stream |
12 | from tqdm.asyncio import tqdm_asyncio | ||
12 | 13 | ||
13 | logger = logging.getLogger(__name__) | 14 | logger = logging.getLogger(__name__) |
14 | 15 | ||
... | @@ -28,28 +29,25 @@ def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[st | ... | @@ -28,28 +29,25 @@ def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[st |
28 | return target_urls | 29 | return target_urls |
29 | 30 | ||
30 | 31 | ||
31 | @retry( | ||
32 | stop=stop_after_attempt(3), | ||
33 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
34 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
35 | after=after_log(logger, logging.DEBUG), | ||
36 | ) | ||
37 | async def fetch(session, url): | 32 | async def fetch(session, url): |
38 | async with session.get(url) as response: | 33 | async with session.get(url) as response: |
39 | return await response.json() | 34 | if response.status != 200: |
35 | logger.error(f"请求失败,状态码为:{response.status}") | ||
36 | else: | ||
37 | return await response.json() | ||
40 | 38 | ||
41 | 39 | ||
42 | async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]: | 40 | async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]: |
43 | async with aiohttp.ClientSession() as session: | 41 | async with aiohttp.ClientSession() as session: |
44 | requests = [fetch(session, url[0]) for url in urls] | 42 | requests = [fetch(session, url[0]) for url in urls] |
45 | results = await asyncio.gather(*requests) | 43 | results = await tqdm_asyncio.gather(*requests) |
46 | return_data = {} | 44 | return_data = {} |
47 | for k, v in zip(urls, results): | 45 | for k, v in zip(urls, results): |
48 | return_data[k[1]] = ResponseModel(**v) | 46 | return_data[k[1]] = ResponseModel(**v) |
49 | return return_data | 47 | return return_data |
50 | 48 | ||
51 | 49 | ||
52 | def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]: | 50 | def convert_results(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]: |
53 | results: List[GuoDuoSpiderResult] = [] | 51 | results: List[GuoDuoSpiderResult] = [] |
54 | for k, v in guduo_responses.items(): | 52 | for k, v in guduo_responses.items(): |
55 | time_request = k | 53 | time_request = k |
... | @@ -62,24 +60,33 @@ def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], d | ... | @@ -62,24 +60,33 @@ def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], d |
62 | results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType, | 60 | results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType, |
63 | platform=platform, sourceName=sourceName, | 61 | platform=platform, sourceName=sourceName, |
64 | score=score, createTime=createTime)) | 62 | score=score, createTime=createTime)) |
63 | logger.info(f"处理类型是{dataType}的数据,时间是{time_request},获取结果长度为{len(v.data)}") | ||
65 | return results | 64 | return results |
66 | 65 | ||
67 | 66 | ||
68 | async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum): | 67 | async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum): |
69 | result = await fetch_and_parse(urls) | 68 | result = await fetch_and_parse(urls) |
70 | return parse_response_to_spider_result(result, enum) | 69 | return convert_results(result, enum) |
71 | 70 | ||
72 | 71 | ||
72 | @retry( | ||
73 | stop=stop_after_attempt(3), | ||
74 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
75 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
76 | after=after_log(logger, logging.DEBUG), | ||
77 | ) | ||
73 | async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | 78 | async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
74 | """ | 79 | """ |
75 | 获取评分数据 | 80 | 获取评分数据 |
76 | :param spiderParam: 爬取参数 | 81 | :param spiderParam: 爬取参数 |
77 | :return: | 82 | :return: |
78 | """ | 83 | """ |
84 | logger.info(f"开始爬取数据,爬取参数为:{spiderParam}") | ||
79 | url_infos = build_request_url(spiderParam) | 85 | url_infos = build_request_url(spiderParam) |
80 | tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()] | 86 | tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()] |
81 | results = await asyncio.gather(*tasks) | 87 | results = await tqdm_asyncio.gather(*tasks) |
82 | spider_datas = [item for sublist in results for item in sublist] | 88 | spider_datas = [item for sublist in results for item in sublist] |
89 | logger.info(f"爬取数据结束,共爬取{len(spider_datas)}条数据") | ||
83 | return spider_datas | 90 | return spider_datas |
84 | 91 | ||
85 | 92 | ||
... | @@ -97,6 +104,7 @@ if __name__ == '__main__': | ... | @@ -97,6 +104,7 @@ if __name__ == '__main__': |
97 | ], | 104 | ], |
98 | url="http://guduodata.com", | 105 | url="http://guduodata.com", |
99 | ) | 106 | ) |
107 | print(param.get_time_range()) | ||
100 | res = asyncio.run(get_score_data(param)) | 108 | res = asyncio.run(get_score_data(param)) |
101 | print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}") | 109 | print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}") |
102 | print(res) | 110 | print(res) | ... | ... |
-
Please register or sign in to post a comment