Code optimization
Showing 2 changed files with 22 additions and 14 deletions
File 1 of 2:

@@ -97,10 +97,10 @@ def get_dates() -> Tuple[str, str]:
     :return: Tuple[str, str]
     """
     today = datetime.today()
-    yesterday = today - timedelta(days=1)
-    today_str = today.strftime("%Y-%m-%d")
-    yesterday_str = yesterday.strftime("%Y-%m-%d")
-    return today_str, yesterday_str
+    start = today - timedelta(days=4)
+    end = today.strftime("%Y-%m-%d")
+    start_str = start.strftime("%Y-%m-%d")
+    return end, start_str


 if __name__ == "__main__":
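The only change in this file widens the crawl window: instead of today and yesterday, get_dates() now returns today and the date four days earlier. A minimal, self-contained sketch of the new behaviour (the call sites are not shown in this hunk):

# Sketch of the new get_dates() behaviour; imports match the ones already in the file.
from datetime import datetime, timedelta
from typing import Tuple

def get_dates() -> Tuple[str, str]:
    today = datetime.today()
    start = today - timedelta(days=4)
    # Note the order: the end of the range (today) comes first, the start second.
    return today.strftime("%Y-%m-%d"), start.strftime("%Y-%m-%d")

# Run on 2024-05-10 this would print ('2024-05-10', '2024-05-06').
print(get_dates())

Any caller that unpacks the returned tuple still gets the later date first and the earlier date second, matching the old (today, yesterday) order.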
File 2 of 2:

@@ -9,6 +9,7 @@ from app.schemas.config_schema import UrlTemplateInfo
 from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel
 from app.config.app_config import getAppConfig
 from superstream import Stream
+from tqdm.asyncio import tqdm_asyncio

 logger = logging.getLogger(__name__)

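The new import is used further down to replace asyncio.gather with tqdm_asyncio.gather, which gathers the same way but renders a progress bar while the coroutines run. A small, self-contained sketch of that usage (the worker coroutine is made up for illustration):

# Standalone sketch: tqdm_asyncio.gather as a drop-in for asyncio.gather.
import asyncio

from tqdm.asyncio import tqdm_asyncio

async def work(i: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for an HTTP request
    return i * i

async def main() -> None:
    # Results come back in argument order, as with asyncio.gather,
    # while a progress bar tracks how many awaitables have completed.
    results = await tqdm_asyncio.gather(*(work(i) for i in range(10)))
    print(results)

asyncio.run(main())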
@@ -28,28 +29,25 @@ def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[st
     return target_urls


-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
 async def fetch(session, url):
     async with session.get(url) as response:
+        if response.status != 200:
+            logger.error(f"Request failed, status code: {response.status}")
+        else:
             return await response.json()


 async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]:
     async with aiohttp.ClientSession() as session:
         requests = [fetch(session, url[0]) for url in urls]
-        results = await asyncio.gather(*requests)
+        results = await tqdm_asyncio.gather(*requests)
         return_data = {}
         for k, v in zip(urls, results):
             return_data[k[1]] = ResponseModel(**v)
         return return_data


-def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
+def convert_results(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
     results: List[GuoDuoSpiderResult] = []
     for k, v in guduo_responses.items():
         time_request = k
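With the retry decorator removed here and the status check added, fetch() now logs non-200 responses and implicitly returns None for them. A self-contained sketch of that pattern (the endpoint is only an illustration, not part of this project):

# Standalone sketch of the status-checked fetch pattern; the URL is illustrative.
import asyncio
import logging

import aiohttp

logger = logging.getLogger(__name__)

async def fetch(session: aiohttp.ClientSession, url: str):
    async with session.get(url) as response:
        if response.status != 200:
            logger.error(f"Request failed, status code: {response.status}")
            return None  # made explicit here so the failure case is visible
        return await response.json()

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        print(await fetch(session, "https://httpbin.org/json"))

asyncio.run(main())

Note that fetch_and_parse() still feeds every result straight into ResponseModel(**v), so a failed request would surface there as a TypeError on None rather than at the HTTP layer; filtering out None results before that loop is one possible follow-up.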
@@ -62,24 +60,33 @@ def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], d
         results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType,
                                           platform=platform, sourceName=sourceName,
                                           score=score, createTime=createTime))
+        logger.info(f"Processed data of type {dataType} for {time_request}, result length: {len(v.data)}")
     return results


 async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum):
     result = await fetch_and_parse(urls)
-    return parse_response_to_spider_result(result, enum)
+    return convert_results(result, enum)


+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     """
     Fetch score data
     :param spiderParam: crawl parameters
     :return:
     """
+    logger.info(f"Start crawling, parameters: {spiderParam}")
     url_infos = build_request_url(spiderParam)
     tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()]
-    results = await asyncio.gather(*tasks)
+    results = await tqdm_asyncio.gather(*tasks)
     spider_datas = [item for sublist in results for item in sublist]
+    logger.info(f"Crawling finished, fetched {len(spider_datas)} records")
     return spider_datas


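The tenacity retry policy has moved from the per-request fetch() to the top-level get_score_data(), so a transient exception now re-runs the whole crawl rather than a single request, and non-200 statuses no longer trigger retries at all since they are logged instead of raised. For reference, a minimal sketch of the same decorator stack applied to a made-up coroutine:

# Standalone sketch of the tenacity policy used above, on a dummy coroutine.
import asyncio
import logging

from tenacity import after_log, before_sleep_log, retry, stop_after_attempt, wait_exponential

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

attempts = {"count": 0}

@retry(
    stop=stop_after_attempt(3),                            # give up after 3 attempts
    wait=wait_exponential(multiplier=1, min=1, max=10),    # exponential backoff, 1s to 10s
    before_sleep=before_sleep_log(logger, logging.DEBUG),  # log before each retry sleep
    after=after_log(logger, logging.DEBUG),                # log after each attempt
)
async def flaky() -> str:
    attempts["count"] += 1
    if attempts["count"] < 3:  # fail the first two attempts to show the retries
        raise RuntimeError("temporary error")
    return f"ok after {attempts['count']} attempts"

print(asyncio.run(flaky()))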
@@ -97,6 +104,7 @@ if __name__ == '__main__':
         ],
         url="http://guduodata.com",
     )
+    print(param.get_time_range())
     res = asyncio.run(get_score_data(param))
     print(f"Program elapsed time: {(time.time() - start_time) / 1000}, result length: {len(res)}")
     print(res)
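One pre-existing detail in this __main__ block: time.time() already returns seconds, so dividing the difference by 1000 understates the elapsed time; that division is only needed for millisecond clocks. A small standalone sketch of the plain seconds-based pattern:

# Standalone sketch: measuring wall-clock time in seconds with time.time().
import time

start_time = time.time()
sum(range(10_000_000))  # stand-in for the real workload
print(f"Elapsed time: {time.time() - start_time:.3f}s")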