Commit 4c03d0d2 4c03d0d2ad43b5a1b64266e4a6c5b2313a0a84f3 by 文鑫

bug修复

1 parent c022d21e
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
2 <module type="PYTHON_MODULE" version="4"> 2 <module type="PYTHON_MODULE" version="4">
3 <component name="NewModuleRootManager"> 3 <component name="NewModuleRootManager">
4 <content url="file://$MODULE_DIR$" /> 4 <content url="file://$MODULE_DIR$" />
5 <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" /> 5 <orderEntry type="jdk" jdkName="spider" jdkType="Python SDK" />
6 <orderEntry type="sourceFolder" forTests="false" /> 6 <orderEntry type="sourceFolder" forTests="false" />
7 </component> 7 </component>
8 <component name="PackageRequirementsSettings"> 8 <component name="PackageRequirementsSettings">
......
...@@ -3,4 +3,5 @@ ...@@ -3,4 +3,5 @@
3 <component name="Black"> 3 <component name="Black">
4 <option name="sdkName" value="Spider" /> 4 <option name="sdkName" value="Spider" />
5 </component> 5 </component>
6 <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" />
6 </project> 7 </project>
...\ No newline at end of file ...\ No newline at end of file
......
1 {
2 "version": "0.2.0",
3 "configurations": []
4 }
1 {
2 "files.exclude": {
3 "__pycache__": true,
4 "**/*.pyc": true,
5 "**/*.pyo": true,
6 "**/*.pyd": true
7 }
8 }
1 import logging 1 import logging
2 import os 2 import os
3 import sys 3 import sys
4
4 # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app' 5 # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app'
5 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6 import uvicorn 7 import uvicorn
...@@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig ...@@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig
12 13
13 app = FastAPI(lifespan=lifespan) 14 app = FastAPI(lifespan=lifespan)
14 app.include_router(spider_router.spiderRouter) 15 app.include_router(spider_router.spiderRouter)
15 if __name__ == '__main__': 16 if __name__ == "__main__":
16 appConf = getAppConfig() 17 appConf = getAppConfig()
17 uvicorn.run("main:app", host="0.0.0.0", 18 uvicorn.run(
18 port=appConf.server.port, 19 "main:app", host="0.0.0.0", port=appConf.server.port, log_config=getLogConfig()
19 log_config=getLogConfig()) 20 )
20 logging.info(f"启动成功->{appConf.server}") 21 logging.info(f"启动成功->{appConf.server}")
......
...@@ -10,24 +10,32 @@ class TypeEnum(IntEnum): ...@@ -10,24 +10,32 @@ class TypeEnum(IntEnum):
10 EPISODES = 1 # 剧集 10 EPISODES = 1 # 剧集
11 MOVIE = 2 # 电影 11 MOVIE = 2 # 电影
12 ANIME = 3 # 动漫 12 ANIME = 3 # 动漫
13 VARIETY = 4 # 综艺
13 14
14 def get_precise_positioning(self, other) -> List[str]: 15 def get_precise_positioning(self, other) -> List[str]:
15 if self.ANIME == other: 16 if self.ANIME == other:
16 return ['.cateIcon_5', 17 return [".cateIcon_5", "li.anime-platform:nth-child(1)"]
17 'li.anime-platform:nth-child(1)']
18 if self.MOVIE == other: 18 if self.MOVIE == other:
19 return ['.cateIcon_3', 19 return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"]
20 'ul.type-box:nth-child(1) > li:nth-child(1)']
21 if self.EPISODES == other: 20 if self.EPISODES == other:
22 return ['.cateIcon_1', 21 return [
23 'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)', 22 ".cateIcon_1",
24 'ul.type-box:nth-child(1) > li:nth-child(1)'] 23 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
24 "ul.type-box:nth-child(1) > li:nth-child(1)",
25 ]
26 if self.VARIETY == other:
27 return [
28 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
29 "ul.type-box:nth-child(1) > li:nth-child(1)",
30 ]
25 31
26 32
27 class GuoDuoSpiderResult(BaseModel): 33 class GuoDuoSpiderResult(BaseModel):
28 # time 数据时间 yyyy-MM-dd 格式的字符串 34 # time 数据时间 yyyy-MM-dd 格式的字符串
29 time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 35 time: str = Field(
30 # type '1剧集 2电影 3 动漫', 36 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
37 )
38 # type '1剧集 2电影 3 动漫 4综艺',
31 targetType: TypeEnum = Field(..., description="数据类型") 39 targetType: TypeEnum = Field(..., description="数据类型")
32 # platform 平台名字 40 # platform 平台名字
33 platform: str 41 platform: str
...@@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel): ...@@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel):
43 51
44 52
45 class SpiderParams(BaseModel): 53 class SpiderParams(BaseModel):
46 startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 54 startDate: str = Field(
47 endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 55 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
56 )
57 endDate: str = Field(
58 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
59 )
48 # type '1剧集 2电影 3 动漫', 60 # type '1剧集 2电影 3 动漫',
49 target_type: List[TypeEnum] 61 target_type: List[TypeEnum]
50 # 爬取的地址 62 # 爬取的地址
...@@ -57,12 +69,12 @@ class SpiderParams(BaseModel): ...@@ -57,12 +69,12 @@ class SpiderParams(BaseModel):
57 start_date = self.startDate 69 start_date = self.startDate
58 end_date = self.endDate 70 end_date = self.endDate
59 time_range = [] 71 time_range = []
60 start_date_obj = datetime.strptime(start_date, '%Y-%m-%d') 72 start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
61 end_date_obj = datetime.strptime(end_date, '%Y-%m-%d') 73 end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
62 delta = end_date_obj - start_date_obj 74 delta = end_date_obj - start_date_obj
63 for i in range(delta.days + 1): 75 for i in range(delta.days + 1):
64 day = start_date_obj + timedelta(days=i) 76 day = start_date_obj + timedelta(days=i)
65 time_range.append(day.strftime('%Y-%m-%d')) 77 time_range.append(day.strftime("%Y-%m-%d"))
66 return time_range 78 return time_range
67 79
68 80
......
...@@ -5,7 +5,12 @@ from app.model.mysql_model import SpiderModel ...@@ -5,7 +5,12 @@ from app.model.mysql_model import SpiderModel
5 from superstream import Stream 5 from superstream import Stream
6 6
7 from app.job.job import scheduler 7 from app.job.job import scheduler
8 from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult 8 from app.schemas.spider_schema import (
9 ApschedulerJob,
10 TaskInfo,
11 SpiderParams,
12 GuoDuoSpiderResult,
13 )
9 from app.spider.guduo_spider import startBrowser 14 from app.spider.guduo_spider import startBrowser
10 15
11 logger = logging.getLogger(__name__) 16 logger = logging.getLogger(__name__)
...@@ -16,21 +21,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): ...@@ -16,21 +21,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
16 cron = taskInfo.executeStrategy 21 cron = taskInfo.executeStrategy
17 cron_parameterization = cron.split(" ") 22 cron_parameterization = cron.split(" ")
18 minute = cron_parameterization[1] 23 minute = cron_parameterization[1]
19 minute = (None if minute == "?" else minute) 24 minute = None if minute == "?" else minute
20 hour = cron_parameterization[2] 25 hour = cron_parameterization[2]
21 hour = (None if hour == "?" else hour) 26 hour = None if hour == "?" else hour
22 day = cron_parameterization[3] 27 day = cron_parameterization[3]
23 day = (None if day == "?" else day) 28 day = None if day == "?" else day
24 month = cron_parameterization[4] 29 month = cron_parameterization[4]
25 month = (None if month == "?" else month) 30 month = None if month == "?" else month
26 weekend = cron_parameterization[5] 31 weekend = cron_parameterization[5]
27 weekend = (None if weekend == "?" else weekend) 32 weekend = None if weekend == "?" else weekend
28 job = scheduler.add_job(task, "cron", 33 job = scheduler.add_job(
29 minute=minute, hour=hour, day=day, month=month, week=weekend, 34 task,
30 args=params) 35 "cron",
36 minute=minute,
37 hour=hour,
38 day=day,
39 month=month,
40 week=weekend,
41 args=params,
42 )
31 jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId)) 43 jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
32 logger.info( 44 logger.info(
33 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}") 45 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}"
46 )
34 47
35 48
36 def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): 49 def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
...@@ -52,18 +65,17 @@ def delete_job(taskId: int): ...@@ -52,18 +65,17 @@ def delete_job(taskId: int):
52 65
53 66
54 def get_job_info(taskId: int): 67 def get_job_info(taskId: int):
55 job = (Stream(jobs) 68 job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
56 .filter(lambda x: x.taskId == taskId) 69 return f"job 信息->{job}"
57 .find_first())
58 return f'job 信息->{job}'
59 70
60 71
61 async def scrawl_and_save(taskParam: SpiderParams): 72 async def scrawl_and_save(taskParam: SpiderParams):
62 # 执行爬虫获取结果 73 # 执行爬虫获取结果
63 results = await startBrowser(taskParam) 74 results = await startBrowser(taskParam)
75 logger.info(f"爬虫重试情况:{startBrowser.statistics}")
64 asyncTasks = (save_or_update(item) for item in results) 76 asyncTasks = (save_or_update(item) for item in results)
65 await asyncio.gather(*asyncTasks) 77 await asyncio.gather(*asyncTasks)
66 logger.info(f'爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成') 78 logger.info(f"爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成")
67 79
68 80
69 async def save_or_update(result: GuoDuoSpiderResult): 81 async def save_or_update(result: GuoDuoSpiderResult):
...@@ -75,10 +87,7 @@ async def save_or_update(result: GuoDuoSpiderResult): ...@@ -75,10 +87,7 @@ async def save_or_update(result: GuoDuoSpiderResult):
75 createTime = result.createTime 87 createTime = result.createTime
76 # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入 88 # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入
77 obj = await SpiderModel.get_or_none( 89 obj = await SpiderModel.get_or_none(
78 time=time, 90 time=time, targetType=targetType, platform=platform, sourceName=sourceName
79 targetType=targetType,
80 platform=platform,
81 sourceName=sourceName
82 ) 91 )
83 if obj: 92 if obj:
84 for key, value in result.__dict__.items(): 93 for key, value in result.__dict__.items():
...@@ -90,6 +99,6 @@ async def save_or_update(result: GuoDuoSpiderResult): ...@@ -90,6 +99,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
90 platform=platform, 99 platform=platform,
91 sourceName=sourceName, 100 sourceName=sourceName,
92 score=score, 101 score=score,
93 createTime=createTime 102 createTime=createTime,
94 ) 103 )
95 await obj.save() 104 await obj.save()
......
...@@ -7,10 +7,24 @@ from superstream import Stream ...@@ -7,10 +7,24 @@ from superstream import Stream
7 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum 7 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
8 import logging 8 import logging
9 from tqdm.asyncio import tqdm_asyncio 9 from tqdm.asyncio import tqdm_asyncio
10 from tenacity import (
11 after_log,
12 before_sleep_log,
13 retry,
14 stop_after_attempt,
15 wait_exponential,
16 wait_fixed,
17 )
10 18
11 logger = logging.getLogger(__name__) 19 logger = logging.getLogger(__name__)
12 20
13 21
22 @retry(
23 stop=stop_after_attempt(3),
24 before_sleep=before_sleep_log(logger, logging.DEBUG),
25 wait=wait_exponential(multiplier=1, min=1, max=10),
26 after=after_log(logger, logging.DEBUG),
27 )
14 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: 28 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
15 time_range = spiderParam.get_time_range() 29 time_range = spiderParam.get_time_range()
16 url = spiderParam.url 30 url = spiderParam.url
...@@ -18,34 +32,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: ...@@ -18,34 +32,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
18 async with async_playwright() as p: 32 async with async_playwright() as p:
19 browser = await p.firefox.launch(headless=True) 33 browser = await p.firefox.launch(headless=True)
20 # 创建指定个数的浏览器页面 34 # 创建指定个数的浏览器页面
21 pages = await tqdm_asyncio.gather(*(browser.new_page() 35 pages = await tqdm_asyncio.gather(
22 for _ in range(len(time_range)))) 36 *(browser.new_page() for _ in range(len(time_range)))
37 )
23 # 同时循环time_range与pages 去调用hand_one_data异步方法 38 # 同时循环time_range与pages 去调用hand_one_data异步方法
24 results = await tqdm_asyncio.gather(*(hand_one_data(time_range[i], pages[i], 39 results = await tqdm_asyncio.gather(
25 scrawl_types, url) 40 *(
26 for i in range(len(time_range)))) 41 hand_one_data(time_range[i], pages[i], scrawl_types, url)
27 return (Stream(results) 42 for i in range(len(time_range))
28 .filter(lambda x: x is not None) 43 )
29 .filter(lambda x: len(x) > 0) 44 )
30 .flat_map(lambda x: Stream(x)) 45 return (
31 .to_list()) 46 Stream(results)
47 .filter(lambda x: x is not None)
48 .filter(lambda x: len(x) > 0)
49 .flat_map(lambda x: Stream(x))
50 .to_list()
51 )
32 52
33 53
34 async def hand_one_data(targetDate: str, 54 async def hand_one_data(
35 page: Page, 55 targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
36 scrawl_types: List[TypeEnum], 56 ) -> List[GuoDuoSpiderResult]:
37 url: str) -> List[GuoDuoSpiderResult]:
38 # 把date这个日期对象解析为年、月、日 57 # 把date这个日期对象解析为年、月、日
39 year, month, day = targetDate.split('-') 58 year, month, day = targetDate.split("-")
40 day = day.replace('0', '') 59 day = day.replace("0", "")
41 month = int(month) - 1 60 month = int(month) - 1
42 await page.goto(url) 61 await page.goto(url)
43 # 等待遮罩层消失 62 # 等待遮罩层消失
44 await page.click('//*[@id="currentDateTxt"]') 63 await page.click('//*[@id="currentDateTxt"]')
45 data_css_selector = (f'#day span.pika-button.pika-day[type="button"]' 64 data_css_selector = (
46 f'[data-pika-year="{year}"]' 65 f'#day span.pika-button.pika-day[type="button"]'
47 f'[data-pika-month="{month}"]' 66 f'[data-pika-year="{year}"]'
48 f'[data-pika-day="{day}"]') 67 f'[data-pika-month="{month}"]'
68 f'[data-pika-day="{day}"]'
69 )
49 doc = page.locator(data_css_selector) 70 doc = page.locator(data_css_selector)
50 # 判断指定元素是否存在如果不存在就返回空的[] 71 # 判断指定元素是否存在如果不存在就返回空的[]
51 if not await doc.is_visible(): 72 if not await doc.is_visible():
...@@ -53,12 +74,14 @@ async def hand_one_data(targetDate: str, ...@@ -53,12 +74,14 @@ async def hand_one_data(targetDate: str,
53 # 点击指定日期 74 # 点击指定日期
54 await doc.click() 75 await doc.click()
55 # 最后一步修正定位 76 # 最后一步修正定位
56 css_selectors = (Stream(scrawl_types) 77 css_selectors = (
57 .map(lambda x: (x, x.get_precise_positioning(x))) 78 Stream(scrawl_types)
58 .group_by(lambda x: x[0])) 79 .map(lambda x: (x, x.get_precise_positioning(x)))
80 .group_by(lambda x: x[0])
81 )
59 results = [] 82 results = []
60 for key, value in css_selectors.items(): 83 for key, value in css_selectors.items():
61 logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据') 84 logger.info(f"开始爬取 {targetDate} 类型是{value[0][0]} 的数据")
62 for css_selector in value[0][1]: 85 for css_selector in value[0][1]:
63 await page.click(css_selector) 86 await page.click(css_selector)
64 result = await get_data(page, key.value, targetDate) 87 result = await get_data(page, key.value, targetDate)
...@@ -67,39 +90,53 @@ async def hand_one_data(targetDate: str, ...@@ -67,39 +90,53 @@ async def hand_one_data(targetDate: str,
67 return results 90 return results
68 91
69 92
70 async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]: 93 async def get_data(
71 target_data_css_selector = '.rank-box' 94 targetPage: Page, scrawl_type: int, target_time: str
95 ) -> Optional[List[GuoDuoSpiderResult]]:
96 target_data_css_selector = ".rank-box"
72 table = targetPage.locator(target_data_css_selector) 97 table = targetPage.locator(target_data_css_selector)
73 if table is None: 98 if table is None:
74 print(f'当前页面获取table为空:{scrawl_type}') 99 print(f"当前页面获取table为空:{scrawl_type}")
75 return None 100 return None
76 # 获取当前这个doc_obj元素下面所有的tr标签 101 # 获取当前这个doc_obj元素下面所有的tr标签
77 trs = table.locator('tr') 102 trs = table.locator("tr")
78 if trs is None: 103 if trs is None:
79 print(f'当前页面获取tr为空:{scrawl_type}') 104 print(f"当前页面获取tr为空:{scrawl_type}")
80 # 循环遍历trs下面的每个tr标签下面的td标签 105 # 循环遍历trs下面的每个tr标签下面的td标签
81 at = await trs.all_text_contents() 106 at = await trs.all_text_contents()
82 result = (Stream(at) 107 result = (
83 .filter(lambda x: x.strip() != '') 108 Stream(at)
84 .filter(lambda x: len(x.split(' ')) > 6) 109 .filter(lambda x: x.strip() != "")
85 .map(lambda x: GuoDuoSpiderResult(time=target_time, 110 .filter(lambda x: len(x.split(" ")) > 6)
86 targetType=TypeEnum(scrawl_type), 111 .map(
87 createTime=datetime.now(), 112 lambda x: GuoDuoSpiderResult(
88 platform='all', 113 time=target_time,
89 score=float(x.split(' ')[-1]), 114 targetType=TypeEnum(scrawl_type),
90 sourceName=x.split(' ')[5])) 115 createTime=datetime.now(),
91 .to_list()) 116 platform="all",
117 score=float(x.split(" ")[-1]),
118 sourceName=x.split(" ")[5],
119 )
120 )
121 .to_list()
122 )
92 return result 123 return result
93 124
94 125
95 if __name__ == '__main__': 126 if __name__ == "__main__":
96 # 获取程序开始执行的时间 127 # 获取程序开始执行的时间
97 start_time = time.time() 128 start_time = time.time()
98 param = SpiderParams(startDate='2024-01-02', 129 param = SpiderParams(
99 endDate='2024-01-15', 130 startDate="2024-01-02",
100 target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE] 131 endDate="2024-01-15",
101 , url='http://guduodata.com/') 132 target_type=[
133 TypeEnum.ANIME,
134 TypeEnum.EPISODES,
135 TypeEnum.MOVIE,
136 TypeEnum.VARIETY,
137 ],
138 url="http://guduodata.com/",
139 )
102 res = asyncio.run(startBrowser(param)) 140 res = asyncio.run(startBrowser(param))
103 print(f'程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}') 141 print(f"代码重试情况:{startBrowser.statistics}")
104 for it in res: 142 print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}")
105 print(it)
......
1 aiosqlite==0.20.0
2 annotated-types==0.7.0
3 anyio==4.7.0
4 APScheduler==3.11.0
5 asyncmy==0.2.10
6 click==8.1.7
7 colorama==0.4.6
8 distlib==0.3.8
9 exceptiongroup==1.2.2
10 fastapi==0.115.6
11 greenlet==3.1.1
12 h11==0.14.0
13 idna==3.10
14 iso8601==2.1.0
15 pandoc==2.3
16 playwright==1.49.1
17 plumbum==1.8.3
18 ply==3.11
19 pydantic==2.10.4
20 pydantic_core==2.27.2
21 pyee==12.0.0
22 pypika-tortoise==0.3.2
23 pytz==2024.2
24 pywin32==306
25 PyYAML==6.0.2
26 sniffio==1.3.1
27 starlette==0.41.3
28 SuperStream==0.2.6
29 tenacity==9.0.0
30 tortoise-orm==0.22.2
31 tqdm==4.67.1
32 typing_extensions==4.12.2
33 tzdata==2024.2
34 tzlocal==5.2
35 uvicorn==0.34.0
36 virtualenv==20.25.0