Commit 7acfaa5f6198ac74808ec267b3923afd065d4b78 by wenxin

Fetch data from the API

1 parent 5e498700
...@@ -3,5 +3,5 @@ ...@@ -3,5 +3,5 @@
3 <component name="Black"> 3 <component name="Black">
4 <option name="sdkName" value="Spider" /> 4 <option name="sdkName" value="Spider" />
5 </component> 5 </component>
6 <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" /> 6 <component name="ProjectRootManager" version="2" project-jdk-name="Spider" project-jdk-type="Python SDK" />
7 </project> 7 </project>
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -5,16 +5,15 @@ tortoise: ...@@ -5,16 +5,15 @@ tortoise:
5 default: 5 default:
6 engine: tortoise.backends.mysql 6 engine: tortoise.backends.mysql
7 credentials: 7 credentials:
8 database: ams_test 8 database: spider
9 host: 172.0.31.57 9 host: 192.168.1.91
10 password: Tjlh@2017 10 password: Tjlh@2023
11 port: 3306 11 port: 3306
12 user: root 12 user: root
13 minsize: 10 13 minsize: 10
14 maxsize: 200 14 maxsize: 200
15 connect_timeout: 30 15 connect_timeout: 30
16 echo: True 16 echo: True
17 timezone: Asia/Shanghai
18 apps: 17 apps:
19 models_read: 18 models_read:
20 models: 19 models:
...@@ -24,4 +23,14 @@ tortoise: ...@@ -24,4 +23,14 @@ tortoise:
24 23
25 getSpider: 24 getSpider:
26 # Spider scheduled-task interval, in minutes 25 # Spider scheduled-task interval, in minutes
27 interval: 5 26 interval: 1
27 urlTemplate:
28 # 1 = TV series, 2 = movies, 3 = variety shows, 4 = anime
29 - type: 1
30 template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
31 - type: 2
32 template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
33 - type: 3
34 template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_VARIETY&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
35 - type: 4
36 template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
......
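The new urlTemplate block pairs a numeric content type with a request-path template, and the {date} placeholder is filled in for each day of the crawl window before a request is issued (the entries are read back later through getAppConfig().urlTemplate). A minimal sketch of that substitution, using the type-2 template above and a purely illustrative date:

    # Illustrative only: expand one template row for a single day.
    template = "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
    path = template.format(date="2024-12-22")  # hypothetical date, not taken from the config
    print(path)  # ...category=NETWORK_MOVIE&date=2024-12-22&attach=gdi&orderTitle=gdi&platformId=0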
...@@ -8,8 +8,9 @@ import logging ...@@ -8,8 +8,9 @@ import logging
8 8
9 logger = logging.getLogger(__name__) 9 logger = logging.getLogger(__name__)
10 10
11
11 @functools.lru_cache() 12 @functools.lru_cache()
12 def getAppConfig()->AppConfig: 13 def getAppConfig() -> AppConfig:
13 logger.info('开始加载AppConfig') 14 logger.info('开始加载AppConfig')
14 # Absolute path of the current file 15 # Absolute path of the current file
15 current_file_path = os.path.abspath(__file__) 16 current_file_path = os.path.abspath(__file__)
......
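Because getAppConfig takes no arguments, functools.lru_cache() effectively turns it into a lazily initialised singleton: the configuration is parsed on the first call and every later caller receives the cached AppConfig instance. A minimal sketch of that caching behaviour with a stand-in loader (the function name and return value are placeholders, not the project's code):

    import functools

    @functools.lru_cache()
    def load_once() -> dict:
        # The body runs only on the first call; the result is memoised afterwards.
        print("loading configuration...")
        return {"interval": 1}

    load_once()  # prints "loading configuration..."
    load_once()  # served from the cache, no second load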
1 from datetime import datetime, timedelta
2 import hashlib 1 import hashlib
3 import logging 2 import logging
3 from datetime import datetime, timedelta
4 from typing import Dict, List, Tuple 4 from typing import Dict, List, Tuple
5 from app.model.mysql_model import XWebCrawler 5
6 from apscheduler.schedulers.asyncio import AsyncIOScheduler 6 from apscheduler.schedulers.asyncio import AsyncIOScheduler
7 from superstream import Stream 7 from superstream import Stream
8
8 from app.config.app_config import getAppConfig 9 from app.config.app_config import getAppConfig
10 from app.model.mysql_model import XWebCrawler
9 from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum 11 from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum
10 12
11 logger = logging.getLogger(__name__) 13 logger = logging.getLogger(__name__)
...@@ -19,7 +21,6 @@ scheduler = AsyncIOScheduler() ...@@ -19,7 +21,6 @@ scheduler = AsyncIOScheduler()
19 @scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval")) 21 @scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval"))
20 async def get_spider_config(): 22 async def get_spider_config():
21 from app.service.spider_job_service import ( 23 from app.service.spider_job_service import (
22 add_job,
23 add_guduo_job, 24 add_guduo_job,
24 delete_job, 25 delete_job,
25 get_job_info, 26 get_job_info,
...@@ -40,9 +41,9 @@ async def get_spider_config(): ...@@ -40,9 +41,9 @@ async def get_spider_config():
40 # Fetch the jobs that are already registered 41 # Fetch the jobs that are already registered
41 jobs: List[ApschedulerJob] = get_job_info() 42 jobs: List[ApschedulerJob] = get_job_info()
42 # Handle jobs that have been switched off 43 # Handle jobs that have been switched off
43 regist_jobs: Dict[int, ApschedulerJob] = Stream(jobs).group_by(lambda x: x.taskId) 44 registered_jobs: Dict[int, ApschedulerJob] = Stream(jobs).to_dict(lambda x: x.taskId, lambda y: y)
44 for job in status_group.get(0, []): 45 for job in status_group.get(0, []):
45 if job.id in regist_jobs: 46 if job.id in registered_jobs:
46 delete_job(job.id) 47 delete_job(job.id)
47 logger.info(f"删除任务成功,任务id:{job.id}") 48 logger.info(f"删除任务成功,任务id:{job.id}")
48 # Handle newly added scheduled jobs 49 # Handle newly added scheduled jobs
...@@ -66,7 +67,7 @@ async def get_spider_config(): ...@@ -66,7 +67,7 @@ async def get_spider_config():
66 url=job.target_url, 67 url=job.target_url,
67 taskMd5=md5, 68 taskMd5=md5,
68 ) 69 )
69 if job.id not in regist_jobs: 70 if job.id not in registered_jobs:
70 add_guduo_job(param, taskInfo) 71 add_guduo_job(param, taskInfo)
71 logger.info(f"添加任务成功,任务id:{job.id}") 72 logger.info(f"添加任务成功,任务id:{job.id}")
72 else: 73 else:
......
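The switch from group_by to to_dict matters for the membership tests that follow: group_by would bucket jobs into lists per key, while to_dict yields a flat mapping from taskId to a single ApschedulerJob, which is what `job.id in registered_jobs` expects. In plain Python the call is roughly equivalent to a dict comprehension (assuming each taskId has at most one registered job):

    # Approximate plain-Python equivalent of Stream(jobs).to_dict(lambda x: x.taskId, lambda y: y)
    registered_jobs = {job.taskId: job for job in jobs}
    # Membership checks such as `job.id in registered_jobs` then test against task ids.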
...@@ -15,7 +15,7 @@ handlers: ...@@ -15,7 +15,7 @@ handlers:
15 class: logging.handlers.TimedRotatingFileHandler 15 class: logging.handlers.TimedRotatingFileHandler
16 level: INFO 16 level: INFO
17 formatter: standard 17 formatter: standard
18 filename: 'logs/app.log' # log file path and name 18 filename: '/Users/mac/PycharmProjects/pythonProject/FastAPIProject1/app/logs/app.log' # log file path and name
19 when: midnight # rotate daily at midnight 19 when: midnight # rotate daily at midnight
20 interval: 1 # rotate once per day 20 interval: 1 # rotate once per day
21 backupCount: 7 # keep the last 7 days of log files 21 backupCount: 7 # keep the last 7 days of log files
......
This diff could not be displayed because it is too large.
...@@ -46,6 +46,12 @@ class Port(BaseModel): ...@@ -46,6 +46,12 @@ class Port(BaseModel):
46 port: int 46 port: int
47 47
48 48
49 class UrlTemplateInfo(BaseModel):
50 type: int
51 template: str
52
53
49 class AppConfig(BaseModel): 54 class AppConfig(BaseModel):
50 server: Port 55 server: Port
51 getSpider: Dict[str, int] 56 getSpider: Dict[str, int]
57 urlTemplate: List[UrlTemplateInfo]
......
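UrlTemplateInfo gives each urlTemplate row from the YAML a validated shape, and AppConfig now carries the whole list. A small sketch of validating a single row with the model above (the row values are copied from the configuration hunk earlier in this commit):

    row = {
        "type": 2,
        "template": "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0",
    }
    info = UrlTemplateInfo(**row)  # a row missing either field raises a pydantic ValidationError
    print(info.type, info.template)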
1 import threading
2 from typing import TypeVar, List, Dict, Generic
3
4 K = TypeVar('K') # define type variables
5
6 V = TypeVar('V')
7
8
9 class SafeDict(Generic[K, V]):
10
11 def __init__(self):
12 self.lock = threading.Lock()
13 self.data: Dict[K, V] = {}
14
15 def put(self, key: K, value: V):
16 with self.lock:
17 self.data.update({key: value})
18
19 def remove(self, key: K):
20 with self.lock:
21 self.data.pop(key)
22
23 def values(self) -> List[V]:
24 with self.lock:
25 return list(self.data.values())
26
27 def get(self, key: K) -> V:
28 with self.lock:
29 return self.data.get(key)
30
31
32
33
34 if __name__ == '__main__':
35 test: SafeDict[int, str] = SafeDict()
36 test.put(1, '1')
37 print(test.get(1))
38 print(test.get(2))
39 t2 = {}
40 print(t2.get(1))
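SafeDict funnels every read and write through one threading.Lock, so concurrent writers cannot interleave inside a single update. A short sketch exercising it from several threads (thread count and key ranges are arbitrary):

    import threading

    store: SafeDict[int, str] = SafeDict()

    def writer(offset: int) -> None:
        for i in range(100):
            store.put(offset + i, f"value-{offset + i}")

    threads = [threading.Thread(target=writer, args=(n * 1000,)) for n in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print(len(store.values()))  # 400: every write landed, the key ranges do not collide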
1 from datetime import timedelta, datetime 1 from datetime import timedelta, datetime
2 from enum import IntEnum 2 from enum import IntEnum
3 from typing import List 3 from typing import List, Optional
4 4
5 from apscheduler.job import Job 5 from apscheduler.job import Job
6 from pydantic import BaseModel, Field 6 from pydantic import BaseModel, Field
...@@ -17,19 +17,22 @@ class TypeEnum(IntEnum): ...@@ -17,19 +17,22 @@ class TypeEnum(IntEnum):
17 17
18 def get_precise_positioning(self, other) -> List[str]: 18 def get_precise_positioning(self, other) -> List[str]:
19 if self.ANIME == other: 19 if self.ANIME == other:
20 return [".cateIcon_5", "li.anime-platform:nth-child(1)"] 20 return ["html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active",
21 "li.anime-platform:nth-child(1)"]
21 if self.MOVIE == other: 22 if self.MOVIE == other:
22 return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"] 23 return ["li.active:nth-child(3)",
24 "ul.type-box:nth-child(1) > li:nth-child(1)"]
23 if self.EPISODES == other: 25 if self.EPISODES == other:
24 return [ 26 return [
25 ".cateIcon_1", 27 "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)",
26 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", 28 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
27 "ul.type-box:nth-child(1) > li:nth-child(1)", 29 "ul.type-box:nth-child(1) > li:nth-child(1)",
28 ] 30 ]
29 if self.VARIETY == other: 31 if self.VARIETY == other:
30 return [ 32 return [
33 "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)",
31 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", 34 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
32 "ul.type-box:nth-child(1) > li:nth-child(1)", 35 "ul.type-box:nth-child(1) > li:nth-child(1)"
33 ] 36 ]
34 37
35 38
...@@ -96,3 +99,57 @@ class TaskInfo(BaseModel): ...@@ -96,3 +99,57 @@ class TaskInfo(BaseModel):
96 taskId: int 99 taskId: int
97 url: str 100 url: str
98 taskMd5: str 101 taskMd5: str
102
103
104 class PlatformImage(BaseModel):
105 url: str
106
107
108 class DataItem(BaseModel):
109 showId: int | None
110 name: str
111 category: str | None
112 releaseDate: int | None
113 days: int | None
114 offlineDate: int | None
115 releaseStatus: int | None
116 episode: int | None
117 gdi: str | None
118 gdiFloat: float
119 attachData: str
120 rank: int | None
121 rise: int | None
122 platformImgs: List[str] | None
123 topHotCount: Optional[int] = None
124 topHotCountDay: Optional[int] = None
125 totalPlayCount: int | None
126
127
128 class ResponseModel(BaseModel):
129 code: int
130 msg: str
131 data: List[DataItem]
132 extra: Optional[dict] = None
133
134
135 if __name__ == "__main__":
136 from datetime import datetime
137
138 # Timestamps, in milliseconds
139 release_date_timestamp = 1573488000000
140 offline_date_timestamp = 1576080000000
141
142
143 # Convert a timestamp to a datetime object and format it as 'yyyy-MM-dd'
144 def timestamp_to_date_string(timestamp_ms):
145 # Convert from milliseconds to seconds by dividing by 1000, then use fromtimestamp
146 dt_object = datetime.fromtimestamp(timestamp_ms / 1000)
147 return dt_object.strftime('%Y-%m-%d')
148
149
150 # Call the helper and print the results
151 formatted_release_date = timestamp_to_date_string(release_date_timestamp)
152 formatted_offline_date = timestamp_to_date_string(offline_date_timestamp)
153
154 print(f"Release Date: {formatted_release_date}")
155 print(f"Offline Date: {formatted_offline_date}")
......
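The __main__ helper above converts millisecond epochs with datetime.fromtimestamp, which interprets the value in the host's local timezone, so the printed dates can shift by a day depending on where the code runs. A variant pinned to an explicit zone, assuming the API's epochs should be read as UTC (that assumption is not confirmed by the commit):

    from datetime import datetime, timezone

    def timestamp_to_date_string_utc(timestamp_ms: int) -> str:
        # Same millisecond-to-second division, but with an explicit UTC zone
        # so the result no longer depends on the machine's local timezone.
        return datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc).strftime('%Y-%m-%d')

    print(timestamp_to_date_string_utc(1573488000000))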
1 import asyncio 1 import asyncio
2 from datetime import datetime
3 import logging 2 import logging
4 from typing import List, Any 3 from datetime import datetime
5 from app.model.mysql_model import SpiderModel 4 from multiprocessing import Manager
6 from superstream import Stream 5 from typing import List, Any, Dict
6 from app.schemas.safe_contrainer import SafeDict
7 from app.job.job import scheduler 7 from app.job.job import scheduler
8 from app.model.mysql_model import SpiderModel
8 from app.schemas.spider_schema import ( 9 from app.schemas.spider_schema import (
9 ApschedulerJob, 10 ApschedulerJob,
10 TaskInfo, 11 TaskInfo,
11 SpiderParams, 12 SpiderParams,
12 GuoDuoSpiderResult, 13 GuoDuoSpiderResult,
13 ) 14 )
14 from app.spider.guduo_spider import startBrowser 15 from app.spider.http_spider import get_score_data
15 16
16 logger = logging.getLogger(__name__) 17 logger = logging.getLogger(__name__)
17 jobs: List[ApschedulerJob] = [] 18 jobs: SafeDict[int, ApschedulerJob] = SafeDict()
18 19
19 20
20 def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): 21 def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
...@@ -40,7 +41,9 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): ...@@ -40,7 +41,9 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
40 week=weekend, 41 week=weekend,
41 args=params, 42 args=params,
42 ) 43 )
43 jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId,taskMd5=taskInfo.md5())) 44 jobs.put(taskInfo.taskId,
45 ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.taskMd5)
46 )
44 logger.info( 47 logger.info(
45 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}" 48 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}"
46 ) 49 )
...@@ -57,26 +60,21 @@ def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): ...@@ -57,26 +60,21 @@ def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
57 60
58 61
59 def delete_job(taskId: int): 62 def delete_job(taskId: int):
60 for job in jobs: 63 jobs.remove(taskId)
61 if job.taskId == taskId:
62 job.jobId.remove()
63 jobs.remove(job)
64 logger.info(f"删除任务成功,任务id:{taskId}") 64 logger.info(f"删除任务成功,任务id:{taskId}")
65 65
66 66
67 def get_job_info(taskId: int) -> List[ApschedulerJob]: 67 def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
68 if taskId is None: 68 if taskId is None:
69 return jobs 69 return jobs.values()
70 job = Stream(jobs).filter(lambda x: x.taskId == taskId).to_list() 70 return [jobs.get(taskId)] if jobs.get(taskId) else []
71 return f"job 信息->{job}"
72 71
73 72
74 async def scrawl_and_save(taskParam: SpiderParams): 73 async def scrawl_and_save(taskParam: SpiderParams):
75 try: 74 try:
76 # Run the spider to collect results; the call below is wrapped in try/except to catch failures 75 # Run the spider to collect results; the call below is wrapped in try/except to catch failures
77 results = await startBrowser(taskParam) 76 results = await get_score_data(taskParam)
78 except Exception as e: 77 except Exception as e:
79 logger.info(f"爬虫重试情况:{startBrowser.statistics}")
80 logger.error(f"爬虫任务执行失败,失败原因:{e}") 78 logger.error(f"爬虫任务执行失败,失败原因:{e}")
81 return 79 return
82 asyncTasks = (save_or_update(item) for item in results) 80 asyncTasks = (save_or_update(item) for item in results)
......
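Note that the rewritten delete_job only drops the entry from the SafeDict registry, whereas the removed loop also called job.jobId.remove() to cancel the job inside APScheduler itself. If the scheduler-side job still needs to be cancelled (the commit may handle that elsewhere), a purely illustrative variant could look like this:

    def delete_job(taskId: int):
        job = jobs.get(taskId)
        if job is None:
            return
        job.jobId.remove()   # cancel the APScheduler job, as the previous implementation did
        jobs.remove(taskId)  # then drop it from the thread-safe registry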
...@@ -18,18 +18,12 @@ from tenacity import ( ...@@ -18,18 +18,12 @@ from tenacity import (
18 logger = logging.getLogger(__name__) 18 logger = logging.getLogger(__name__)
19 19
20 20
21 @retry(
22 stop=stop_after_attempt(3),
23 before_sleep=before_sleep_log(logger, logging.DEBUG),
24 wait=wait_exponential(multiplier=1, min=1, max=10),
25 after=after_log(logger, logging.DEBUG),
26 )
27 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: 21 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
28 time_range = spiderParam.get_time_range() 22 time_range = spiderParam.get_time_range()
29 url = spiderParam.url 23 url = spiderParam.url
30 scrawl_types = spiderParam.target_type 24 scrawl_types = spiderParam.target_type
31 async with async_playwright() as p: 25 async with async_playwright() as p:
32 browser = await p.firefox.launch(headless=True) 26 browser = await p.firefox.launch(headless=False)
33 # Create the required number of browser pages 27 # Create the required number of browser pages
34 pages = await tqdm_asyncio.gather( 28 pages = await tqdm_asyncio.gather(
35 *(browser.new_page() for _ in range(len(time_range))) 29 *(browser.new_page() for _ in range(len(time_range)))
...@@ -51,14 +45,14 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: ...@@ -51,14 +45,14 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
51 45
52 46
53 async def hand_one_data( 47 async def hand_one_data(
54 targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str 48 targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
55 ) -> List[GuoDuoSpiderResult]: 49 ) -> List[GuoDuoSpiderResult]:
56 # Split the date string into year, month and day 50 # Split the date string into year, month and day
57 year, month, day = targetDate.split("-") 51 year, month, day = targetDate.split("-")
58 day = day.replace("0", "") 52 day = day.replace("0", "")
59 month = int(month) - 1 53 month = int(month) - 1
60 await page.goto(url) 54 await page.goto(url)
61 # Wait for the overlay mask to disappear 55 # Click
62 await page.click('//*[@id="currentDateTxt"]') 56 await page.click('//*[@id="currentDateTxt"]')
63 data_css_selector = ( 57 data_css_selector = (
64 f'#day span.pika-button.pika-day[type="button"]' 58 f'#day span.pika-button.pika-day[type="button"]'
...@@ -70,6 +64,10 @@ async def hand_one_data( ...@@ -70,6 +64,10 @@ async def hand_one_data(
70 # If the target element does not exist, return an empty [] 64 # If the target element does not exist, return an empty []
71 if not await doc.is_visible(): 65 if not await doc.is_visible():
72 return [] 66 return []
67 # Check whether the date can be clicked
68 if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
69 logger.warning(f"日期 {targetDate} 被禁用,无法点击")
70 return []
73 # Click the target date 71 # Click the target date
74 await doc.click() 72 await doc.click()
75 # Final step: refine the locator 73 # Final step: refine the locator
...@@ -90,7 +88,7 @@ async def hand_one_data( ...@@ -90,7 +88,7 @@ async def hand_one_data(
90 88
91 89
92 async def get_data( 90 async def get_data(
93 targetPage: Page, scrawl_type: int, target_time: str 91 targetPage: Page, scrawl_type: int, target_time: str
94 ) -> Optional[List[GuoDuoSpiderResult]]: 92 ) -> Optional[List[GuoDuoSpiderResult]]:
95 target_data_css_selector = ".rank-box" 93 target_data_css_selector = ".rank-box"
96 table = targetPage.locator(target_data_css_selector) 94 table = targetPage.locator(target_data_css_selector)
...@@ -122,12 +120,22 @@ async def get_data( ...@@ -122,12 +120,22 @@ async def get_data(
122 return result 120 return result
123 121
124 122
123 @retry(
124 stop=stop_after_attempt(3),
125 before_sleep=before_sleep_log(logger, logging.DEBUG),
126 wait=wait_exponential(multiplier=1, min=1, max=10),
127 after=after_log(logger, logging.DEBUG),
128 )
129 async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
130 return await startBrowser(spiderParam)
131
132
125 if __name__ == "__main__": 133 if __name__ == "__main__":
126 # Record the program start time 134 # Record the program start time
127 start_time = time.time() 135 start_time = time.time()
128 param = SpiderParams( 136 param = SpiderParams(
129 startDate="2024-01-02", 137 startDate="2024-12-22",
130 endDate="2024-01-15", 138 endDate="2024-12-23",
131 target_type=[ 139 target_type=[
132 TypeEnum.ANIME, 140 TypeEnum.ANIME,
133 TypeEnum.EPISODES, 141 TypeEnum.EPISODES,
...@@ -136,6 +144,6 @@ if __name__ == "__main__": ...@@ -136,6 +144,6 @@ if __name__ == "__main__":
136 ], 144 ],
137 url="http://guduodata.com/", 145 url="http://guduodata.com/",
138 ) 146 )
139 res = asyncio.run(startBrowser(param)) 147 res = asyncio.run(retry_job(param))
140 print(f"代码重试情况:{startBrowser.statistics}") 148 print(f"代码重试情况:{retry_job.statistics}")
141 print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}") 149 print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
......
1 import asyncio
2 import datetime
3 import time
4 from typing import List, Dict, Tuple
5 import aiohttp
6 from tenacity import retry, stop_after_attempt, before_sleep_log, wait_exponential, after_log
7 import logging
8 from app.schemas.config_schema import UrlTemplateInfo
9 from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel
10 from app.config.app_config import getAppConfig
11 from superstream import Stream
12
13 logger = logging.getLogger(__name__)
14
15
16 def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[str, str]]]:
17 base_url = spiderParam.url
18 targets = spiderParam.target_type
19 templates: List[UrlTemplateInfo] = getAppConfig().urlTemplate
20 templates_group: Dict[int, str] = Stream(templates).to_map(lambda x: x.type, lambda x: x.template)
21 # Each date in the requested time range
22 time_range = spiderParam.get_time_range()
23 target_urls: Dict[TypeEnum, List[Tuple[str, str]]] = {}
24 for target in targets:
25 template = templates_group[target.value]
26 url_type = [(base_url + template.format(date=item), item) for item in time_range]
27 target_urls[target] = url_type
28 return target_urls
29
30
31 @retry(
32 stop=stop_after_attempt(3),
33 before_sleep=before_sleep_log(logger, logging.DEBUG),
34 wait=wait_exponential(multiplier=1, min=1, max=10),
35 after=after_log(logger, logging.DEBUG),
36 )
37 async def fetch(session, url):
38 async with session.get(url) as response:
39 return await response.json()
40
41
42 async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]:
43 async with aiohttp.ClientSession() as session:
44 requests = [fetch(session, url[0]) for url in urls]
45 results = await asyncio.gather(*requests)
46 return_data = {}
47 for k, v in zip(urls, results):
48 return_data[k[1]] = ResponseModel(**v)
49 return return_data
50
51
52 def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
53 results: List[GuoDuoSpiderResult] = []
54 for k, v in guduo_responses.items():
55 time_request = k
56 targetType = dataType
57 platform = 'ALL'
58 createTime = datetime.datetime.now()
59 for item in v.data:
60 sourceName = item.name
61 score = item.gdiFloat
62 results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType,
63 platform=platform, sourceName=sourceName,
64 score=score, createTime=createTime))
65 return results
66
67
68 async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum):
69 result = await fetch_and_parse(urls)
70 return parse_response_to_spider_result(result, enum)
71
72
73 async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
74 """
75 Fetch score data
76 :param spiderParam: crawl parameters
77 :return:
78 """
79 url_infos = build_request_url(spiderParam)
80 tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()]
81 results = await asyncio.gather(*tasks)
82 spider_datas = [item for sublist in results for item in sublist]
83 return spider_datas
84
85
86 if __name__ == '__main__':
87 # Record the program start time
88 start_time = time.time()
89 param = SpiderParams(
90 startDate="2024-12-22",
91 endDate="2024-12-23",
92 target_type=[
93 TypeEnum.ANIME,
94 TypeEnum.EPISODES,
95 TypeEnum.MOVIE,
96 TypeEnum.VARIETY,
97 ],
98 url="http://guduodata.com",
99 )
100 res = asyncio.run(get_score_data(param))
101 print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
102 print(res)
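For reference, build_request_url returns a mapping from each requested TypeEnum to a list of (full_url, date) pairs; fetch_and_parse then fans those URLs out with asyncio.gather and keys each parsed ResponseModel by the date half of its pair. A sketch of the shape for one type and one day (values illustrative):

    from app.schemas.spider_schema import TypeEnum

    url_infos = {
        TypeEnum.MOVIE: [
            # (full request URL, the date it covers)
            ("http://guduodata.com/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date=2024-12-22&attach=gdi&orderTitle=gdi&platformId=0",
             "2024-12-22"),
        ],
    }
    # fetch_and_parse then produces {"2024-12-22": ResponseModel(...)} from the responses.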
...@@ -33,3 +33,5 @@ tzdata==2024.2 ...@@ -33,3 +33,5 @@ tzdata==2024.2
33 tzlocal==5.2 33 tzlocal==5.2
34 uvicorn==0.34.0 34 uvicorn==0.34.0
35 virtualenv==20.25.0 35 virtualenv==20.25.0
36
37 aiohttp~=3.11.11
...\ No newline at end of file ...\ No newline at end of file
......