Commit 601ef000 (601ef0003b1a3ff1326114c11fcd829b7fc894f5) by wenxin

update

1 parent a3f8eab9
@@ -27,7 +27,7 @@ getSpider:
   urlTemplate:
     # 1 TV series  2 movie  3 variety show  4 anime
     - type: 1
-      template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+      template: "/m/v3/billboard/list?type=DAILY&category=DRAMA&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 2
       template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 3
......
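With this change, type 1 (TV series) now requests category=DRAMA instead of the mismatched ALL_ANIME, in line with the mapping in the comment. The {date} placeholder is presumably filled in per day when the spider builds request URLs (that code is not part of this diff); a minimal sketch of the expansion, with the host taken from the SpiderParams below:

    # hypothetical expansion of one configured template
    BASE_URL = "http://guduodata.com"
    template = ("/m/v3/billboard/list?type=DAILY&category=DRAMA"
                "&date={date}&attach=gdi&orderTitle=gdi&platformId=0")
    url = BASE_URL + template.format(date="2024-12-20")
    print(url)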
@@ -30,6 +30,22 @@ async def lifespan(app: FastAPI):
     logging.info("apscheduler startup complete")
     await init()
     logging.info("database initialization complete")
+    # record the program start time
+    start_time = time.time()
+    param = SpiderParams(
+        startDate="2024-12-20",
+        endDate="2024-12-21",
+        target_type=[
+            TypeEnum.ANIME,
+            TypeEnum.EPISODES,
+            TypeEnum.MOVIE,
+            TypeEnum.VARIETY,
+        ],
+        url="http://guduodata.com",
+    )
+    print(param.get_time_range())
+    asyncio.run(scrawl_and_save(param))
+    print(f"program elapsed time: {(time.time() - start_time) / 1000}")
     yield
     # in the async context manager, clean up the machine learning models and release resources on "context exit"
     scheduler.shutdown()
......
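A caveat on the added startup block: lifespan already executes inside the running event loop, so asyncio.run(scrawl_and_save(param)) raises RuntimeError: asyncio.run() cannot be called from a running event loop. A minimal sketch of the fix, awaiting the coroutine directly (names taken from this diff):

    start_time = time.time()
    await scrawl_and_save(param)  # lifespan is itself a coroutine; no asyncio.run()
    print(f"program elapsed time: {time.time() - start_time:.2f}s")

Running the crawl inline also blocks startup until it finishes; handing it to the already-running apscheduler, or to asyncio.create_task, would let the app come up immediately.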
This diff could not be displayed because it is too large.
 # router/users.py
 import logging
 from typing import Dict, Any
-
 from fastapi import APIRouter, Path
-
 from app.schemas.spider_schema import SpiderParams, TaskInfo
 from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
 from app.spider import guduo_spider
@@ -35,7 +33,7 @@ async def add_test_job(taskId: int = Path(..., description="要查询的任务

     taskInfo = TaskInfo(taskId=taskId, taskStart=True,
                         executeStrategy='0 0/1 * * * ?',
-                        url='www.baidu.com')
+                        url='www.baidu.com', taskMd5='test')
     add_job(['test'], taskInfo, test)

......
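TaskInfo now requires a taskMd5, stubbed here as 'test'. Assuming the field is meant to fingerprint the task definition (the schema change itself is not shown in this diff), a hypothetical helper:

    import hashlib

    def task_md5(task_id: int, execute_strategy: str, url: str) -> str:
        # hypothetical: stable MD5 fingerprint over a task's identifying fields
        raw = f"{task_id}|{execute_strategy}|{url}".encode("utf-8")
        return hashlib.md5(raw).hexdigest()

    print(task_md5(1, '0 0/1 * * * ?', 'www.baidu.com'))  # deterministic hex digest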
@@ -29,8 +29,6 @@ class SafeDict(Generic[K, V]):
         return self.data.get(key)


-
-
 if __name__ == '__main__':
     test: SafeDict[int, str] = SafeDict()
     test.put(1, '1')
......
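Only SafeDict's get and its put/get usage appear in this diff. Assuming the class is a lock-guarded dict wrapper (its full body is not shown), a minimal sketch consistent with the calls above:

    import threading
    from typing import Dict, Generic, Optional, TypeVar

    K = TypeVar("K")
    V = TypeVar("V")

    class SafeDict(Generic[K, V]):
        # minimal sketch: a plain dict guarded by a lock for thread-safe access
        def __init__(self) -> None:
            self.data: Dict[K, V] = {}
            self._lock = threading.Lock()

        def put(self, key: K, value: V) -> None:
            with self._lock:
                self.data[key] = value

        def get(self, key: K) -> Optional[V]:
            with self._lock:
                return self.data.get(key)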
 import asyncio
 import logging
+import time
 from datetime import datetime
-from multiprocessing import Manager
-from typing import List, Any, Dict
-from app.schemas.safe_contrainer import SafeDict
+from typing import List, Any
 from app.job.job import scheduler
 from app.model.mysql_model import SpiderModel
+from app.schemas.safe_contrainer import SafeDict
 from app.schemas.spider_schema import (
     ApschedulerJob,
     TaskInfo,
     SpiderParams,
-    GuoDuoSpiderResult,
+    GuoDuoSpiderResult, TypeEnum,
 )
 from app.spider.http_spider import get_score_data

@@ -73,10 +73,12 @@ def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
 async def scrawl_and_save(taskParam: SpiderParams):
     try:
         # run the spider to fetch results; the try/except around the next line catches failures
+        logger.info("fetching site data")
         results = await get_score_data(taskParam)
     except Exception as e:
         logger.error(f"spider task failed, reason: {e}")
         return
+    logger.info("saving data")
     asyncTasks = (save_or_update(item) for item in results)
     await asyncio.gather(*asyncTasks)
     logger.info(f"spider task finished: scraped {len(results)} records and saved them to the database")
@@ -108,3 +110,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
         updateTime=datetime.now(),
     )
     await obj.save()
+
+
+
......
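scrawl_and_save fans the saves out with asyncio.gather, which by default raises the first exception it sees while the sibling tasks keep running unsupervised. If one bad record should not abort the whole batch, a sketch of a drop-in replacement for the two gather lines above (names from this diff):

    outcomes = await asyncio.gather(
        *(save_or_update(item) for item in results),
        return_exceptions=True,  # collect exceptions instead of raising the first
    )
    for item, outcome in zip(results, outcomes):
        if isinstance(outcome, Exception):
            logger.error(f"saving one record failed: {outcome}")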
@@ -18,12 +18,18 @@ from tenacity import (
 logger = logging.getLogger(__name__)


+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     time_range = spiderParam.get_time_range()
     url = spiderParam.url
     scrawl_types = spiderParam.target_type
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=False)
+        browser = await p.firefox.launch(headless=True)
         # create the specified number of browser pages
         pages = await tqdm_asyncio.gather(
             *(browser.new_page() for _ in range(len(time_range)))
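Moving the @retry policy from the removed retry_job wrapper (below) onto startBrowser keeps the policy next to the code it guards: up to three attempts, with an exponentially growing wait clamped between 1 and 10 seconds, and DEBUG-level logging around each retry. tenacity also attaches a statistics dict to the decorated function, which the __main__ block below prints. A standalone sketch of the same policy against a hypothetical always-failing function:

    import logging
    from tenacity import retry, stop_after_attempt, wait_exponential

    @retry(stop=stop_after_attempt(3),
           wait=wait_exponential(multiplier=1, min=1, max=10))
    def flaky() -> str:
        # hypothetical function that always fails, to show the retry bookkeeping
        raise RuntimeError("transient failure")

    try:
        flaky()  # raises tenacity.RetryError after the third attempt
    except Exception:
        print(flaky.statistics)  # e.g. {'attempt_number': 3, 'idle_for': ...}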
@@ -52,7 +58,7 @@ async def hand_one_data(
     day = day.replace("0", "")
     month = int(month) - 1
     await page.goto(url)
-    # click
+    # wait for the overlay to disappear
     await page.click('//*[@id="currentDateTxt"]')
     data_css_selector = (
         f'#day span.pika-button.pika-day[type="button"]'
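A pre-existing wrinkle in the context lines above: day.replace("0", "") deletes every zero, so "20" becomes "2" and "10" becomes "1". If the intent is to drop only a leading zero before matching the Pika day button (an assumption; the full selector logic is not shown), a hedged fix:

    day = day.lstrip("0")   # "05" -> "5", but "20" stays "20"
    month = int(month) - 1  # the picker's month index is zero-based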
@@ -64,10 +70,6 @@ async def hand_one_data(
     # if the target element does not exist, return an empty []
     if not await doc.is_visible():
         return []
-    # check whether the date is clickable
-    if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
-        logger.warning(f"date {targetDate} is disabled and cannot be clicked")
-        return []
     # click the target date
     await doc.click()
     # final step: correct the positioning
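The removed guard used to skip disabled dates. If the date picker can still render disabled days, a more tolerant check before clicking (a sketch reusing the is-disabled class from the removed code; substring matching survives extra CSS classes):

    classes = await doc.get_attribute("class") or ""
    if "is-disabled" in classes:
        logger.warning(f"date {targetDate} is disabled, skipping")
        return []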
@@ -120,22 +122,12 @@ async def get_data(
     return result


-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
-async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
-    return await startBrowser(spiderParam)
-
-
 if __name__ == "__main__":
     # record the program start time
     start_time = time.time()
     param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
+        startDate="2024-01-02",
+        endDate="2024-01-15",
         target_type=[
             TypeEnum.ANIME,
             TypeEnum.EPISODES,
@@ -144,6 +136,6 @@ if __name__ == "__main__":
         ],
         url="http://guduodata.com/",
     )
-    res = asyncio.run(retry_job(param))
-    print(f"retry statistics: {retry_job.statistics}")
-    print(f"elapsed time: {(time.time() - start_time) / 1000}, result count: {len(res)}")
+    res = asyncio.run(startBrowser(param))
+    print(f"retry statistics: {startBrowser.statistics}")
+    print(f"elapsed time: {(time.time() - start_time)/1000}, result count: {len(res)}")
......
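Note that time.time() returns seconds, so dividing the delta by 1000 reports a figure 1000x too small (the lifespan block above has the same mix-up); time.perf_counter() is also the steadier clock for measuring a run. A hedged sketch reusing the names above:

    start_time = time.perf_counter()
    res = asyncio.run(startBrowser(param))
    print(f"retry statistics: {startBrowser.statistics}")
    print(f"elapsed: {time.perf_counter() - start_time:.2f}s, results: {len(res)}")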
@@ -88,23 +88,3 @@ async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     spider_datas = [item for sublist in results for item in sublist]
     logger.info(f"scraping finished: {len(spider_datas)} records in total")
     return spider_datas
-
-
-if __name__ == '__main__':
-    # record the program start time
-    start_time = time.time()
-    param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
-        target_type=[
-            TypeEnum.ANIME,
-            TypeEnum.EPISODES,
-            TypeEnum.MOVIE,
-            TypeEnum.VARIETY,
-        ],
-        url="http://guduodata.com",
-    )
-    print(param.get_time_range())
-    res = asyncio.run(get_score_data(param))
-    print(f"elapsed time: {(time.time() - start_time) / 1000}, result count: {len(res)}")
-    print(res)
......
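For reference, the list-of-lists flattening kept in get_score_data can be expressed equivalently with itertools; a standalone sketch:

    from itertools import chain

    results = [["a", "b"], ["c"], []]  # stand-in for the per-task result lists
    flat = [item for sublist in results for item in sublist]
    assert flat == list(chain.from_iterable(results))  # ['a', 'b', 'c']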