Fetch data from the API
Showing 13 changed files with 275 additions and 51 deletions
@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Spider" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Spider" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
@@ -5,16 +5,15 @@ tortoise:
     default:
       engine: tortoise.backends.mysql
       credentials:
-        database: ams_test
-        host: 172.0.31.57
-        password: Tjlh@2017
+        database: spider
+        host: 192.168.1.91
+        password: Tjlh@2023
         port: 3306
         user: root
         minsize: 10
         maxsize: 200
         connect_timeout: 30
         echo: True
-        timezone: Asia/Shanghai
   apps:
     models_read:
       models:
@@ -24,4 +23,14 @@ tortoise:
 
 getSpider:
   # interval, in minutes, between runs of the spider scheduled job
-  interval: 5
+  interval: 1
+urlTemplate:
+  # 1 = TV series, 2 = movie, 3 = variety show, 4 = anime
+  - type: 1
+    template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+  - type: 2
+    template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+  - type: 3
+    template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_VARIETY&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+  - type: 4
+    template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
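Note: the {date} placeholder in each template is filled in per request; build_request_url in the new app/spider/http_spider.py (further below) concatenates SpiderParams.url with template.format(date=...). A minimal sketch of that expansion, with an illustrative base URL and date:

# illustrative values; in the code the base URL comes from SpiderParams.url
base_url = "http://guduodata.com"
template = "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
url = base_url + template.format(date="2024-12-22")
print(url)
# -> http://guduodata.com/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date=2024-12-22&attach=gdi&orderTitle=gdi&platformId=0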
@@ -8,8 +8,9 @@ import logging
 
 logger = logging.getLogger(__name__)
 
+
 @functools.lru_cache()
-def getAppConfig()->AppConfig:
+def getAppConfig() -> AppConfig:
     logger.info('开始加载AppConfig')
     # get the absolute path of the current file
     current_file_path = os.path.abspath(__file__)
-from datetime import datetime, timedelta
 import hashlib
 import logging
+from datetime import datetime, timedelta
 from typing import Dict, List, Tuple
-from app.model.mysql_model import XWebCrawler
+
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from superstream import Stream
+
 from app.config.app_config import getAppConfig
+from app.model.mysql_model import XWebCrawler
 from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum
 
 logger = logging.getLogger(__name__)
@@ -19,7 +21,6 @@ scheduler = AsyncIOScheduler()
 @scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval"))
 async def get_spider_config():
     from app.service.spider_job_service import (
-        add_job,
         add_guduo_job,
         delete_job,
         get_job_info,
@@ -40,9 +41,9 @@ async def get_spider_config():
     # fetch the jobs that are already registered
     jobs: List[ApschedulerJob] = get_job_info()
     # handle tasks that have been switched off
-    regist_jobs: Dict[int, ApschedulerJob] = Stream(jobs).group_by(lambda x: x.taskId)
+    registered_jobs: Dict[int, ApschedulerJob] = Stream(jobs).to_dict(lambda x: x.taskId, lambda y: y)
     for job in status_group.get(0, []):
-        if job.id in regist_jobs:
+        if job.id in registered_jobs:
             delete_job(job.id)
             logger.info(f"删除任务成功,任务id:{job.id}")
     # handle new scheduled tasks
@@ -66,7 +67,7 @@ async def get_spider_config():
             url=job.target_url,
             taskMd5=md5,
         )
-        if job.id not in regist_jobs:
+        if job.id not in registered_jobs:
            add_guduo_job(param, taskInfo)
            logger.info(f"添加任务成功,任务id:{job.id}")
        else:
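Note: this hunk swaps Stream(jobs).group_by(...) for Stream(jobs).to_dict(...), so registered_jobs maps each taskId to a single ApschedulerJob rather than to a list; the membership checks key on taskId either way. A self-contained sketch of the mapping this is assumed to produce, with a hypothetical JobStub standing in for ApschedulerJob:

from dataclasses import dataclass


@dataclass
class JobStub:  # hypothetical stand-in for ApschedulerJob, illustration only
    taskId: int
    taskMd5: str


jobs = [JobStub(1, "a"), JobStub(2, "b")]
# what Stream(jobs).to_dict(lambda x: x.taskId, lambda y: y) is assumed to return
registered_jobs = {j.taskId: j for j in jobs}
print(1 in registered_jobs)  # True  -> already registered
print(3 in registered_jobs)  # False -> would be scheduled as a new job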
@@ -15,7 +15,7 @@ handlers:
     class: logging.handlers.TimedRotatingFileHandler
     level: INFO
     formatter: standard
-    filename: 'logs/app.log' # path and name of the log file
+    filename: '/Users/mac/PycharmProjects/pythonProject/FastAPIProject1/app/logs/app.log' # path and name of the log file
     when: midnight # rotate daily, at midnight
     interval: 1 # rotate once per day
     backupCount: 7 # keep the last 7 days of log files
This diff could not be displayed because it is too large.
@@ -46,6 +46,12 @@ class Port(BaseModel):
     port: int
 
 
+class UrlTemplateInfo(BaseModel):
+    type: int
+    template: str
+
+
 class AppConfig(BaseModel):
     server: Port
     getSpider: Dict[str, int]
+    urlTemplate: List[UrlTemplateInfo]
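Note: a minimal sketch of how one urlTemplate entry from the YAML is expected to validate against the new UrlTemplateInfo model. The model is restated so the snippet runs on its own; the raw dict is an illustrative value, whereas in the app it comes from the parsed config file via getAppConfig():

from typing import List

from pydantic import BaseModel


class UrlTemplateInfo(BaseModel):  # restated from config_schema.py for a standalone run
    type: int
    template: str


raw = [
    {"type": 2, "template": "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"},
]
templates: List[UrlTemplateInfo] = [UrlTemplateInfo(**item) for item in raw]
print(templates[0].type, templates[0].template)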
app/schemas/safe_contrainer.py
0 → 100644
import threading
from typing import TypeVar, List, Dict, Generic

K = TypeVar('K')  # generic type variables for the key and value types

V = TypeVar('V')


class SafeDict(Generic[K, V]):

    def __init__(self):
        self.lock = threading.Lock()
        self.data: Dict[K, V] = {}

    def put(self, key: K, value: V):
        with self.lock:
            self.data.update({key: value})

    def remove(self, key: K):
        with self.lock:
            self.data.pop(key)

    def values(self) -> List[V]:
        with self.lock:
            return list(self.data.values())

    def get(self, key: K) -> V:
        with self.lock:
            return self.data.get(key)


if __name__ == '__main__':
    test: SafeDict[int, str] = SafeDict()
    test.put(1, '1')
    print(test.get(1))
    print(test.get(2))
    t2 = {}
    print(t2.get(1))
 from datetime import timedelta, datetime
 from enum import IntEnum
-from typing import List
+from typing import List, Optional
 
 from apscheduler.job import Job
 from pydantic import BaseModel, Field
@@ -17,19 +17,22 @@ class TypeEnum(IntEnum):
 
     def get_precise_positioning(self, other) -> List[str]:
         if self.ANIME == other:
-            return [".cateIcon_5", "li.anime-platform:nth-child(1)"]
+            return ["html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active",
+                    "li.anime-platform:nth-child(1)"]
         if self.MOVIE == other:
-            return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"]
+            return ["li.active:nth-child(3)",
+                    "ul.type-box:nth-child(1) > li:nth-child(1)"]
         if self.EPISODES == other:
             return [
-                ".cateIcon_1",
+                "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)",
                 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                 "ul.type-box:nth-child(1) > li:nth-child(1)",
             ]
         if self.VARIETY == other:
             return [
+                "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)",
                 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
-                "ul.type-box:nth-child(1) > li:nth-child(1)",
+                "ul.type-box:nth-child(1) > li:nth-child(1)"
             ]
@@ -96,3 +99,57 @@ class TaskInfo(BaseModel):
     taskId: int
     url: str
     taskMd5: str
+
+
+class PlatformImage(BaseModel):
+    url: str
+
+
+class DataItem(BaseModel):
+    showId: int | None
+    name: str
+    category: str | None
+    releaseDate: int | None
+    days: int | None
+    offlineDate: int | None
+    releaseStatus: int | None
+    episode: int | None
+    gdi: str | None
+    gdiFloat: float
+    attachData: str
+    rank: int | None
+    rise: int | None
+    platformImgs: List[str] | None
+    topHotCount: Optional[int] = None
+    topHotCountDay: Optional[int] = None
+    totalPlayCount: int | None
+
+
+class ResponseModel(BaseModel):
+    code: int
+    msg: str
+    data: List[DataItem]
+    extra: Optional[dict] = None
+
+
+if __name__ == "__main__":
+    from datetime import datetime
+
+    # timestamps in milliseconds
+    release_date_timestamp = 1573488000000
+    offline_date_timestamp = 1576080000000
+
+
+    # convert a millisecond timestamp to a datetime and format it as 'yyyy-MM-dd'
+    def timestamp_to_date_string(timestamp_ms):
+        # convert milliseconds to seconds by dividing by 1000, then use fromtimestamp
+        dt_object = datetime.fromtimestamp(timestamp_ms / 1000)
+        return dt_object.strftime('%Y-%m-%d')
+
+
+    # call the function and print the results
+    formatted_release_date = timestamp_to_date_string(release_date_timestamp)
+    formatted_offline_date = timestamp_to_date_string(offline_date_timestamp)
+
+    print(f"Release Date: {formatted_release_date}")
+    print(f"Offline Date: {formatted_offline_date}")
 import asyncio
-from datetime import datetime
 import logging
-from typing import List, Any
-from app.model.mysql_model import SpiderModel
-from superstream import Stream
+from datetime import datetime
+from multiprocessing import Manager
+from typing import List, Any, Dict
+from app.schemas.safe_contrainer import SafeDict
 from app.job.job import scheduler
+from app.model.mysql_model import SpiderModel
 from app.schemas.spider_schema import (
     ApschedulerJob,
     TaskInfo,
     SpiderParams,
     GuoDuoSpiderResult,
 )
-from app.spider.guduo_spider import startBrowser
+from app.spider.http_spider import get_score_data
 
 logger = logging.getLogger(__name__)
-jobs: List[ApschedulerJob] = []
+jobs: SafeDict[int, ApschedulerJob] = SafeDict()
 
 
 def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
@@ -40,7 +41,9 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
         week=weekend,
         args=params,
     )
-    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId,taskMd5=taskInfo.md5()))
+    jobs.put(taskInfo.taskId,
+             ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.taskMd5)
+             )
     logger.info(
         f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}"
     )
@@ -57,26 +60,21 @@ def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
 
 
 def delete_job(taskId: int):
-    for job in jobs:
-        if job.taskId == taskId:
-            job.jobId.remove()
-            jobs.remove(job)
+    jobs.remove(taskId)
     logger.info(f"删除任务成功,任务id:{taskId}")
 
 
-def get_job_info(taskId: int) -> List[ApschedulerJob]:
+def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
     if taskId is None:
-        return jobs
-    job = Stream(jobs).filter(lambda x: x.taskId == taskId).to_list()
-    return f"job 信息->{job}"
+        return jobs.values()
+    return [jobs.get(taskId)] if jobs.get(taskId) else []
 
 
 async def scrawl_and_save(taskParam: SpiderParams):
     try:
         # run the spider and collect the results; the call below is wrapped in try/except to catch failures
-        results = await startBrowser(taskParam)
+        results = await get_score_data(taskParam)
     except Exception as e:
-        logger.info(f"爬虫重试情况:{startBrowser.statistics}")
         logger.error(f"爬虫任务执行失败,失败原因:{e}")
         return
     asyncTasks = (save_or_update(item) for item in results)
@@ -18,18 +18,12 @@ from tenacity import (
 logger = logging.getLogger(__name__)
 
 
-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     time_range = spiderParam.get_time_range()
     url = spiderParam.url
     scrawl_types = spiderParam.target_type
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
+        browser = await p.firefox.launch(headless=False)
         # create one browser page per date in the requested range
         pages = await tqdm_asyncio.gather(
             *(browser.new_page() for _ in range(len(time_range)))
@@ -51,14 +45,14 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
 
 
 async def hand_one_data(
         targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
 ) -> List[GuoDuoSpiderResult]:
     # split the date string into year, month and day
     year, month, day = targetDate.split("-")
     day = day.replace("0", "")
     month = int(month) - 1
     await page.goto(url)
-    # wait for the mask layer to disappear
+    # click
     await page.click('//*[@id="currentDateTxt"]')
     data_css_selector = (
         f'#day span.pika-button.pika-day[type="button"]'
@@ -70,6 +64,10 @@ async def hand_one_data(
     # if the target element does not exist, return an empty []
     if not await doc.is_visible():
         return []
+    # check whether the date is clickable
+    if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
+        logger.warning(f"日期 {targetDate} 被禁用,无法点击")
+        return []
     # click the target date
     await doc.click()
     # final positioning correction
@@ -90,7 +88,7 @@ async def hand_one_data(
 
 
 async def get_data(
         targetPage: Page, scrawl_type: int, target_time: str
 ) -> Optional[List[GuoDuoSpiderResult]]:
     target_data_css_selector = ".rank-box"
     table = targetPage.locator(target_data_css_selector)
@@ -122,12 +120,22 @@ async def get_data(
     return result
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
+async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
+    return await startBrowser(spiderParam)
+
+
 if __name__ == "__main__":
     # record the program start time
     start_time = time.time()
     param = SpiderParams(
-        startDate="2024-01-02",
-        endDate="2024-01-15",
+        startDate="2024-12-22",
+        endDate="2024-12-23",
         target_type=[
             TypeEnum.ANIME,
             TypeEnum.EPISODES,
@@ -136,6 +144,6 @@ if __name__ == "__main__":
         ],
         url="http://guduodata.com/",
     )
-    res = asyncio.run(startBrowser(param))
-    print(f"代码重试情况:{startBrowser.statistics}")
-    print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}")
+    res = asyncio.run(retry_job(param))
+    print(f"代码重试情况:{retry_job.statistics}")
+    print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
app/spider/http_spider.py
0 → 100644
import asyncio
import datetime
import time
from typing import List, Dict, Tuple
import aiohttp
from tenacity import retry, stop_after_attempt, before_sleep_log, wait_exponential, after_log
import logging
from app.schemas.config_schema import UrlTemplateInfo
from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel
from app.config.app_config import getAppConfig
from superstream import Stream

logger = logging.getLogger(__name__)


def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[str, str]]]:
    base_url = spiderParam.url
    targets = spiderParam.target_type
    templates: List[UrlTemplateInfo] = getAppConfig().urlTemplate
    templates_group: Dict[int, str] = Stream(templates).to_map(lambda x: x.type, lambda x: x.template)
    # every date in the requested time range
    time_range = spiderParam.get_time_range()
    target_urls: Dict[TypeEnum, List[Tuple[str, str]]] = {}
    for target in targets:
        template = templates_group[target.value]
        url_type = [(base_url + template.format(date=item), item) for item in time_range]
        target_urls[target] = url_type
    return target_urls


@retry(
    stop=stop_after_attempt(3),
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    after=after_log(logger, logging.DEBUG),
)
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.json()


async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]:
    async with aiohttp.ClientSession() as session:
        requests = [fetch(session, url[0]) for url in urls]
        results = await asyncio.gather(*requests)
        return_data = {}
        for k, v in zip(urls, results):
            return_data[k[1]] = ResponseModel(**v)
        return return_data


def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]:
    results: List[GuoDuoSpiderResult] = []
    for k, v in guduo_responses.items():
        time_request = k
        targetType = dataType
        platform = 'ALL'
        createTime = datetime.datetime.now()
        for item in v.data:
            sourceName = item.name
            score = item.gdiFloat
            results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType,
                                              platform=platform, sourceName=sourceName,
                                              score=score, createTime=createTime))
    return results


async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum):
    result = await fetch_and_parse(urls)
    return parse_response_to_spider_result(result, enum)


async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
    """
    Fetch the score data.
    :param spiderParam: spider parameters
    :return:
    """
    url_infos = build_request_url(spiderParam)
    tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()]
    results = await asyncio.gather(*tasks)
    spider_datas = [item for sublist in results for item in sublist]
    return spider_datas


if __name__ == '__main__':
    # record the program start time
    start_time = time.time()
    param = SpiderParams(
        startDate="2024-12-22",
        endDate="2024-12-23",
        target_type=[
            TypeEnum.ANIME,
            TypeEnum.EPISODES,
            TypeEnum.MOVIE,
            TypeEnum.VARIETY,
        ],
        url="http://guduodata.com",
    )
    res = asyncio.run(get_score_data(param))
    print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
    print(res)
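Note: get_score_data fans out one request per (type, date) pair and collects the responses with asyncio.gather over a shared aiohttp session. A standalone sketch of that fetch/gather pattern; the URLs are illustrative and any JSON-returning endpoint would do:

import asyncio
import aiohttp


async def fetch(session: aiohttp.ClientSession, url: str) -> dict:
    # one GET per URL; the caller gathers them concurrently
    async with session.get(url) as response:
        return await response.json()


async def main() -> None:
    urls = ["https://httpbin.org/get?date=2024-12-22", "https://httpbin.org/get?date=2024-12-23"]
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, u) for u in urls))
    print(len(results), "responses fetched concurrently")


if __name__ == "__main__":
    asyncio.run(main())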