update

Showing 8 changed files with 40 additions and 51 deletions
@@ -27,7 +27,7 @@ getSpider:
   urlTemplate:
     # 1 TV series  2 movie  3 variety show  4 anime
     - type: 1
-      template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+      template: "/m/v3/billboard/list?type=DAILY&category=DRAMA&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 2
       template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 3
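These templates are relative paths with a {date} placeholder; presumably the spider fills in one date per request and prefixes the site root. A minimal sketch of that expansion, where BASE_URL, build_url, and the str.format call are illustrative assumptions rather than code from this repo:

```python
# Hypothetical expansion of one urlTemplate entry; BASE_URL and build_url
# are invented for illustration.
BASE_URL = "http://guduodata.com"

TEMPLATE = (
    "/m/v3/billboard/list?type=DAILY&category=DRAMA"
    "&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
)

def build_url(date: str) -> str:
    # Fill the {date} placeholder and prepend the site root.
    return BASE_URL + TEMPLATE.format(date=date)

print(build_url("2024-12-20"))  # .../list?type=DAILY&category=DRAMA&date=2024-12-20&...
```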
@@ -30,6 +30,22 @@ async def lifespan(app: FastAPI):
     logging.info("apscheduler started")
     await init()
     logging.info("database initialized")
+    # Record the program start time
+    start_time = time.time()
+    param = SpiderParams(
+        startDate="2024-12-20",
+        endDate="2024-12-21",
+        target_type=[
+            TypeEnum.ANIME,
+            TypeEnum.EPISODES,
+            TypeEnum.MOVIE,
+            TypeEnum.VARIETY,
+        ],
+        url="http://guduodata.com",
+    )
+    print(param.get_time_range())
+    await scrawl_and_save(param)  # awaited: asyncio.run() would fail inside the running loop
+    print(f"elapsed: {time.time() - start_time:.2f}s")
     yield
     # On exit from the async context manager, clean up models and release resources
     scheduler.shutdown()
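One note on the startup crawl added to lifespan above: calling asyncio.run() inside a coroutine raises RuntimeError because a loop is already running, so the crawl has to be awaited (as above) or handed to the loop as a task. A self-contained sketch of the background-task variant, with a hypothetical stand-in for scrawl_and_save:

```python
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI

async def scrawl_and_save(param) -> None:
    """Hypothetical stand-in for the service coroutine in the diff above."""
    await asyncio.sleep(0)

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Schedule the crawl without blocking startup; asyncio.run() would
    # raise RuntimeError here because the event loop is already running.
    task = asyncio.create_task(scrawl_and_save(None))
    yield
    # Best-effort cleanup if the crawl is still running at shutdown.
    task.cancel()

app = FastAPI(lifespan=lifespan)
```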
This diff could not be displayed because it is too large.
@@ -1,9 +1,7 @@
 # router/users.py
 import logging
 from typing import Dict, Any
-
 from fastapi import APIRouter, Path
-
 from app.schemas.spider_schema import SpiderParams, TaskInfo
 from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
 from app.spider import guduo_spider
@@ -35,7 +33,7 @@ async def add_test_job(taskId: int = Path(..., description="要查询的任务
 
     taskInfo = TaskInfo(taskId=taskId, taskStart=True,
                         executeStrategy='0 0/1 * * * ?',
-                        url='www.baidu.com')
+                        url='www.baidu.com', taskMd5='test')
     add_job(['test'], taskInfo, test)
 
 
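The test job above passes a Quartz-style cron string ('0 0/1 * * * ?' fires at second 0 of every minute). APScheduler's CronTrigger uses its own field syntax rather than Quartz's, so add_job presumably translates the string internally; a sketch with the equivalent trigger hard-wired (the scheduler setup and test callable are illustrative assumptions):

```python
# Hypothetical mapping of the Quartz expression '0 0/1 * * * ?' onto
# APScheduler; the real translation lives in the project's add_job service.
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

scheduler = AsyncIOScheduler()

def test() -> None:
    print("job fired")

# second=0 with the remaining fields left as wildcards fires once per
# minute, matching '0 0/1 * * * ?'.
scheduler.add_job(test, CronTrigger(second=0))
```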
@@ -29,8 +29,6 @@ class SafeDict(Generic[K, V]):
         return self.data.get(key)
 
 
-
-
 if __name__ == '__main__':
     test: SafeDict[int, str] = SafeDict()
     test.put(1, '1')
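The hunk above only trims blank lines around SafeDict. For context, a lock-guarded implementation consistent with the put/get calls and the data.get(key) line visible in the diff could look like this; the threading.Lock and method bodies are assumptions, not the repo's actual code:

```python
import threading
from typing import Dict, Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class SafeDict(Generic[K, V]):
    """Sketch: a dict whose put/get are serialized by a lock."""

    def __init__(self) -> None:
        self.data: Dict[K, V] = {}
        self._lock = threading.Lock()

    def put(self, key: K, value: V) -> None:
        with self._lock:
            self.data[key] = value

    def get(self, key: K) -> Optional[V]:
        with self._lock:
            return self.data.get(key)

if __name__ == "__main__":
    test: SafeDict[int, str] = SafeDict()
    test.put(1, "1")
    print(test.get(1))  # -> '1'
```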
@@ -1,16 +1,16 @@
 import asyncio
 import logging
+import time
 from datetime import datetime
-from multiprocessing import Manager
-from typing import List, Any, Dict
-from app.schemas.safe_contrainer import SafeDict
+from typing import List, Any
 from app.job.job import scheduler
 from app.model.mysql_model import SpiderModel
+from app.schemas.safe_contrainer import SafeDict
 from app.schemas.spider_schema import (
     ApschedulerJob,
     TaskInfo,
     SpiderParams,
-    GuoDuoSpiderResult,
+    GuoDuoSpiderResult, TypeEnum,
 )
 from app.spider.http_spider import get_score_data
 
@@ -73,10 +73,12 @@ def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
 async def scrawl_and_save(taskParam: SpiderParams):
     try:
         # Run the spider; the try/except catches failures from the call below
+        logger.info("start fetching site data")
         results = await get_score_data(taskParam)
     except Exception as e:
         logger.error(f"spider task failed: {e}")
         return
+    logger.info("start saving data")
     asyncTasks = (save_or_update(item) for item in results)
     await asyncio.gather(*asyncTasks)
     logger.info(f"spider task finished: scraped {len(results)} records and saved them to the database")
@@ -108,3 +110,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
         updateTime=datetime.now(),
     )
     await obj.save()
+
+
+
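A design note on scrawl_and_save above: asyncio.gather with default arguments propagates the first exception and discards the remaining results, so one bad record aborts the whole save step. If partial saves should survive a failure, return_exceptions=True is the usual pattern; a runnable sketch in which save_or_update is a hypothetical stand-in for the ORM call:

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

async def save_or_update(item: dict) -> None:
    # Hypothetical stand-in for the ORM save in the diff above.
    if "bad" in item:
        raise ValueError("simulated save failure")

async def save_all(results: list) -> None:
    # return_exceptions=True keeps one failed save from cancelling the rest.
    outcomes = await asyncio.gather(
        *(save_or_update(item) for item in results), return_exceptions=True
    )
    failures = [o for o in outcomes if isinstance(o, Exception)]
    if failures:
        logger.error("%d of %d saves failed", len(failures), len(results))

asyncio.run(save_all([{"ok": 1}, {"bad": 1}]))
```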
@@ -18,12 +18,18 @@ from tenacity import (
 logger = logging.getLogger(__name__)
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     time_range = spiderParam.get_time_range()
     url = spiderParam.url
     scrawl_types = spiderParam.target_type
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=False)
+        browser = await p.firefox.launch(headless=True)
         # Create one browser page per date in the range
         pages = await tqdm_asyncio.gather(
             *(browser.new_page() for _ in range(len(time_range)))
@@ -52,7 +58,7 @@ async def hand_one_data(
     day = day.replace("0", "")
     month = int(month) - 1
     await page.goto(url)
-    # Click
+    # Wait for the overlay to disappear
     await page.click('//*[@id="currentDateTxt"]')
     data_css_selector = (
         f'#day span.pika-button.pika-day[type="button"]'
@@ -64,10 +70,6 @@ async def hand_one_data(
     # If the target element is not present, return an empty []
     if not await doc.is_visible():
         return []
-    # Check whether the date is clickable
-    if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
-        logger.warning(f"date {targetDate} is disabled and cannot be clicked")
-        return []
     # Click the target date
     await doc.click()
     # Final step: correct the locator
@@ -120,22 +122,12 @@ async def get_data(
     return result
 
 
-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
-async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
-    return await startBrowser(spiderParam)
-
-
 if __name__ == "__main__":
     # Record the program start time
     start_time = time.time()
     param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
+        startDate="2024-01-02",
+        endDate="2024-01-15",
         target_type=[
             TypeEnum.ANIME,
             TypeEnum.EPISODES,
@@ -144,6 +136,6 @@ if __name__ == "__main__":
         ],
         url="http://guduodata.com/",
     )
-    res = asyncio.run(retry_job(param))
-    print(f"retry statistics: {retry_job.statistics}")
-    print(f"elapsed: {(time.time() - start_time) / 1000}, count: {len(res)}")
+    res = asyncio.run(startBrowser(param))
+    print(f"retry statistics: {startBrowser.statistics}")
+    print(f"elapsed: {time.time() - start_time:.2f}s, count: {len(res)}")
@@ -88,23 +88,3 @@ async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     spider_datas = [item for sublist in results for item in sublist]
     logger.info(f"scraping finished: {len(spider_datas)} records in total")
     return spider_datas
-
-
-if __name__ == '__main__':
-    # Record the program start time
-    start_time = time.time()
-    param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
-        target_type=[
-            TypeEnum.ANIME,
-            TypeEnum.EPISODES,
-            TypeEnum.MOVIE,
-            TypeEnum.VARIETY,
-        ],
-        url="http://guduodata.com",
-    )
-    print(param.get_time_range())
-    res = asyncio.run(get_score_data(param))
-    print(f"elapsed: {(time.time() - start_time) / 1000}, count: {len(res)}")
-    print(res)