Commit 411051ea 411051eabdafe26ca50096c1b5a77230554cbcbd by wenxin
2 parents 39153b19 2c400f99
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
2 <module type="PYTHON_MODULE" version="4"> 2 <module type="PYTHON_MODULE" version="4">
3 <component name="NewModuleRootManager"> 3 <component name="NewModuleRootManager">
4 <content url="file://$MODULE_DIR$" /> 4 <content url="file://$MODULE_DIR$" />
5 <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" /> 5 <orderEntry type="jdk" jdkName="spider" jdkType="Python SDK" />
6 <orderEntry type="sourceFolder" forTests="false" /> 6 <orderEntry type="sourceFolder" forTests="false" />
7 </component> 7 </component>
8 <component name="PackageRequirementsSettings"> 8 <component name="PackageRequirementsSettings">
......
...@@ -3,4 +3,5 @@ ...@@ -3,4 +3,5 @@
3 <component name="Black"> 3 <component name="Black">
4 <option name="sdkName" value="Spider" /> 4 <option name="sdkName" value="Spider" />
5 </component> 5 </component>
6 <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" />
6 </project> 7 </project>
...\ No newline at end of file ...\ No newline at end of file
......
1 import logging 1 import logging
2 import os 2 import os
3 import sys 3 import sys
4
4 # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app' 5 # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app'
5 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6 import uvicorn 7 import uvicorn
...@@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig ...@@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig
12 13
13 app = FastAPI(lifespan=lifespan) 14 app = FastAPI(lifespan=lifespan)
14 app.include_router(spider_router.spiderRouter) 15 app.include_router(spider_router.spiderRouter)
15 if __name__ == '__main__': 16 if __name__ == "__main__":
16 appConf = getAppConfig() 17 appConf = getAppConfig()
17 uvicorn.run("main:app", host="0.0.0.0", 18 uvicorn.run(
18 port=appConf.server.port, 19 "main:app", host="0.0.0.0", port=appConf.server.port, log_config=getLogConfig()
19 log_config=getLogConfig()) 20 )
20 logging.info(f"启动成功->{appConf.server}") 21 logging.info(f"启动成功->{appConf.server}")
......
...@@ -13,6 +13,6 @@ class SpiderModel(Model): ...@@ -13,6 +13,6 @@ class SpiderModel(Model):
13 sourceName = fields.CharField(max_length=255, description="媒体资源名字", source_field='source_name') 13 sourceName = fields.CharField(max_length=255, description="媒体资源名字", source_field='source_name')
14 score = fields.FloatField(description="热度得分") 14 score = fields.FloatField(description="热度得分")
15 createTime = fields.DatetimeField(default=datetime.now, description="创建时间", source_field='create_time') 15 createTime = fields.DatetimeField(default=datetime.now, description="创建时间", source_field='create_time')
16 16 updateTime = fields.DatetimeField(default=datetime.now, description="更新时间", source_field='update_time')
17 class Meta: 17 class Meta:
18 table = "spider_data" 18 table = "spider_data"
......
...@@ -10,24 +10,32 @@ class TypeEnum(IntEnum): ...@@ -10,24 +10,32 @@ class TypeEnum(IntEnum):
10 EPISODES = 1 # 剧集 10 EPISODES = 1 # 剧集
11 MOVIE = 2 # 电影 11 MOVIE = 2 # 电影
12 ANIME = 3 # 动漫 12 ANIME = 3 # 动漫
13 VARIETY = 4 # 综艺
13 14
14 def get_precise_positioning(self, other) -> List[str]: 15 def get_precise_positioning(self, other) -> List[str]:
15 if self.ANIME == other: 16 if self.ANIME == other:
16 return ['.cateIcon_5', 17 return [".cateIcon_5", "li.anime-platform:nth-child(1)"]
17 'li.anime-platform:nth-child(1)']
18 if self.MOVIE == other: 18 if self.MOVIE == other:
19 return ['.cateIcon_3', 19 return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"]
20 'ul.type-box:nth-child(1) > li:nth-child(1)']
21 if self.EPISODES == other: 20 if self.EPISODES == other:
22 return ['.cateIcon_1', 21 return [
23 'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)', 22 ".cateIcon_1",
24 'ul.type-box:nth-child(1) > li:nth-child(1)'] 23 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
24 "ul.type-box:nth-child(1) > li:nth-child(1)",
25 ]
26 if self.VARIETY == other:
27 return [
28 "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
29 "ul.type-box:nth-child(1) > li:nth-child(1)",
30 ]
25 31
26 32
27 class GuoDuoSpiderResult(BaseModel): 33 class GuoDuoSpiderResult(BaseModel):
28 # time 数据时间 yyyy-MM-dd 格式的字符窜 34 # time 数据时间 yyyy-MM-dd 格式的字符窜
29 time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 35 time: str = Field(
30 # type '1剧集 2电影 3 动漫', 36 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
37 )
38 # type '1剧集 2电影 3 动漫 4综艺',
31 targetType: TypeEnum = Field(..., description="数据类型") 39 targetType: TypeEnum = Field(..., description="数据类型")
32 # platform 平台名字 40 # platform 平台名字
33 platform: str 41 platform: str
...@@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel): ...@@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel):
43 51
44 52
45 class SpiderParams(BaseModel): 53 class SpiderParams(BaseModel):
46 startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 54 startDate: str = Field(
47 endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') 55 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
56 )
57 endDate: str = Field(
58 ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$"
59 )
48 # type '1剧集 2电影 3 动漫', 60 # type '1剧集 2电影 3 动漫',
49 target_type: List[TypeEnum] 61 target_type: List[TypeEnum]
50 # 爬取的地址 62 # 爬取的地址
...@@ -57,12 +69,12 @@ class SpiderParams(BaseModel): ...@@ -57,12 +69,12 @@ class SpiderParams(BaseModel):
57 start_date = self.startDate 69 start_date = self.startDate
58 end_date = self.endDate 70 end_date = self.endDate
59 time_range = [] 71 time_range = []
60 start_date_obj = datetime.strptime(start_date, '%Y-%m-%d') 72 start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
61 end_date_obj = datetime.strptime(end_date, '%Y-%m-%d') 73 end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
62 delta = end_date_obj - start_date_obj 74 delta = end_date_obj - start_date_obj
63 for i in range(delta.days + 1): 75 for i in range(delta.days + 1):
64 day = start_date_obj + timedelta(days=i) 76 day = start_date_obj + timedelta(days=i)
65 time_range.append(day.strftime('%Y-%m-%d')) 77 time_range.append(day.strftime("%Y-%m-%d"))
66 return time_range 78 return time_range
67 79
68 80
......
...@@ -3,9 +3,13 @@ import logging ...@@ -3,9 +3,13 @@ import logging
3 from typing import List, Any 3 from typing import List, Any
4 from app.model.mysql_model import SpiderModel 4 from app.model.mysql_model import SpiderModel
5 from superstream import Stream 5 from superstream import Stream
6
7 from app.job.job import scheduler 6 from app.job.job import scheduler
8 from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult 7 from app.schemas.spider_schema import (
8 ApschedulerJob,
9 TaskInfo,
10 SpiderParams,
11 GuoDuoSpiderResult,
12 )
9 from app.spider.guduo_spider import startBrowser 13 from app.spider.guduo_spider import startBrowser
10 14
11 logger = logging.getLogger(__name__) 15 logger = logging.getLogger(__name__)
...@@ -16,21 +20,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): ...@@ -16,21 +20,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
16 cron = taskInfo.executeStrategy 20 cron = taskInfo.executeStrategy
17 cron_parameterization = cron.split(" ") 21 cron_parameterization = cron.split(" ")
18 minute = cron_parameterization[1] 22 minute = cron_parameterization[1]
19 minute = (None if minute == "?" else minute) 23 minute = None if minute == "?" else minute
20 hour = cron_parameterization[2] 24 hour = cron_parameterization[2]
21 hour = (None if hour == "?" else hour) 25 hour = None if hour == "?" else hour
22 day = cron_parameterization[3] 26 day = cron_parameterization[3]
23 day = (None if day == "?" else day) 27 day = None if day == "?" else day
24 month = cron_parameterization[4] 28 month = cron_parameterization[4]
25 month = (None if month == "?" else month) 29 month = None if month == "?" else month
26 weekend = cron_parameterization[5] 30 weekend = cron_parameterization[5]
27 weekend = (None if weekend == "?" else weekend) 31 weekend = None if weekend == "?" else weekend
28 job = scheduler.add_job(task, "cron", 32 job = scheduler.add_job(
29 minute=minute, hour=hour, day=day, month=month, week=weekend, 33 task,
30 args=params) 34 "cron",
35 minute=minute,
36 hour=hour,
37 day=day,
38 month=month,
39 week=weekend,
40 args=params,
41 )
31 jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId)) 42 jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
32 logger.info( 43 logger.info(
33 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}") 44 f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}"
45 )
34 46
35 47
36 def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): 48 def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
...@@ -52,18 +64,17 @@ def delete_job(taskId: int): ...@@ -52,18 +64,17 @@ def delete_job(taskId: int):
52 64
53 65
54 def get_job_info(taskId: int): 66 def get_job_info(taskId: int):
55 job = (Stream(jobs) 67 job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
56 .filter(lambda x: x.taskId == taskId) 68 return f"job 信息->{job}"
57 .find_first())
58 return f'job 信息->{job}'
59 69
60 70
61 async def scrawl_and_save(taskParam: SpiderParams): 71 async def scrawl_and_save(taskParam: SpiderParams):
62 # 执行爬虫获取结果 72 # 执行爬虫获取结果
63 results = await startBrowser(taskParam) 73 results = await startBrowser(taskParam)
74 logger.info(f"爬虫重试情况:{startBrowser.statistics}")
64 asyncTasks = (save_or_update(item) for item in results) 75 asyncTasks = (save_or_update(item) for item in results)
65 await asyncio.gather(*asyncTasks) 76 await asyncio.gather(*asyncTasks)
66 logger.info(f'爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成') 77 logger.info(f"爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成")
67 78
68 79
69 async def save_or_update(result: GuoDuoSpiderResult): 80 async def save_or_update(result: GuoDuoSpiderResult):
...@@ -75,10 +86,7 @@ async def save_or_update(result: GuoDuoSpiderResult): ...@@ -75,10 +86,7 @@ async def save_or_update(result: GuoDuoSpiderResult):
75 createTime = result.createTime 86 createTime = result.createTime
76 # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入 87 # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入
77 obj = await SpiderModel.get_or_none( 88 obj = await SpiderModel.get_or_none(
78 time=time, 89 time=time, targetType=targetType, platform=platform, sourceName=sourceName
79 targetType=targetType,
80 platform=platform,
81 sourceName=sourceName
82 ) 90 )
83 if obj: 91 if obj:
84 for key, value in result.__dict__.items(): 92 for key, value in result.__dict__.items():
...@@ -90,6 +98,6 @@ async def save_or_update(result: GuoDuoSpiderResult): ...@@ -90,6 +98,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
90 platform=platform, 98 platform=platform,
91 sourceName=sourceName, 99 sourceName=sourceName,
92 score=score, 100 score=score,
93 createTime=createTime 101 createTime=createTime,
94 ) 102 )
95 await obj.save() 103 await obj.save()
......
...@@ -6,11 +6,25 @@ from playwright.async_api import Page, async_playwright ...@@ -6,11 +6,25 @@ from playwright.async_api import Page, async_playwright
6 from superstream import Stream 6 from superstream import Stream
7 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum 7 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
8 import logging 8 import logging
9 from tqdm.asyncio import tqdm 9 from tqdm.asyncio import tqdm_asyncio
10 from tenacity import (
11 after_log,
12 before_sleep_log,
13 retry,
14 stop_after_attempt,
15 wait_exponential,
16 wait_fixed,
17 )
10 18
11 logger = logging.getLogger(__name__) 19 logger = logging.getLogger(__name__)
12 20
13 21
22 @retry(
23 stop=stop_after_attempt(3),
24 before_sleep=before_sleep_log(logger, logging.DEBUG),
25 wait=wait_exponential(multiplier=1, min=1, max=10),
26 after=after_log(logger, logging.DEBUG),
27 )
14 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: 28 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
15 time_range = spiderParam.get_time_range() 29 time_range = spiderParam.get_time_range()
16 url = spiderParam.url 30 url = spiderParam.url
...@@ -18,16 +32,16 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: ...@@ -18,16 +32,16 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
18 async with async_playwright() as p: 32 async with async_playwright() as p:
19 browser = await p.firefox.launch(headless=True) 33 browser = await p.firefox.launch(headless=True)
20 # 创建指定个数的浏览器页面 34 # 创建指定个数的浏览器页面
21 page_tasks = (browser.new_page() for _ in range(len(time_range))) 35 pages = await tqdm_asyncio.gather(
22 pages_generator = tqdm.as_completed(page_tasks) 36 *(browser.new_page() for _ in range(len(time_range)))
23 pages = list(pages_generator) 37 )
24 # 同时循环time_range与pages 去调用hand_one_data异步方法 38 # 同时循环time_range与pages 去调用hand_one_data异步方法
25 hand_result_tasks = ( 39 results = await tqdm_asyncio.gather(
26 hand_one_data(time_range[i], pages[i], scrawl_types, url) 40 *(
27 for i in range(len(time_range)) 41 hand_one_data(time_range[i], pages[i], scrawl_types, url)
42 for i in range(len(time_range))
43 )
28 ) 44 )
29 results__generator = tqdm.as_completed(hand_result_tasks)
30 results = list(results__generator)
31 return ( 45 return (
32 Stream(results) 46 Stream(results)
33 .filter(lambda x: x is not None) 47 .filter(lambda x: x is not None)
...@@ -38,7 +52,7 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: ...@@ -38,7 +52,7 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
38 52
39 53
40 async def hand_one_data( 54 async def hand_one_data(
41 targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str 55 targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
42 ) -> List[GuoDuoSpiderResult]: 56 ) -> List[GuoDuoSpiderResult]:
43 # 把date这个日期对象解析为年、月、日 57 # 把date这个日期对象解析为年、月、日
44 year, month, day = targetDate.split("-") 58 year, month, day = targetDate.split("-")
...@@ -77,17 +91,17 @@ async def hand_one_data( ...@@ -77,17 +91,17 @@ async def hand_one_data(
77 91
78 92
79 async def get_data( 93 async def get_data(
80 targetPage: Page, scrawl_type: int, target_time: str 94 targetPage: Page, scrawl_type: int, target_time: str
81 ) -> Optional[List[GuoDuoSpiderResult]]: 95 ) -> Optional[List[GuoDuoSpiderResult]]:
82 target_data_css_selector = ".rank-box" 96 target_data_css_selector = ".rank-box"
83 table = targetPage.locator(target_data_css_selector) 97 table = targetPage.locator(target_data_css_selector)
84 if table is None: 98 if table is None:
85 logger.info(f"当前页面获取table为空:{scrawl_type}") 99 print(f"当前页面获取table为空:{scrawl_type}")
86 return None 100 return None
87 # 获取当前这个doc_obj元素下面所有的tr标签 101 # 获取当前这个doc_obj元素下面所有的tr标签
88 trs = table.locator("tr") 102 trs = table.locator("tr")
89 if trs is None: 103 if trs is None:
90 logger.info(f"当前页面获取tr为空:{scrawl_type}") 104 print(f"当前页面获取tr为空:{scrawl_type}")
91 # 循环遍历trs下面的每个tr标签下面的td标签 105 # 循环遍历trs下面的每个tr标签下面的td标签
92 at = await trs.all_text_contents() 106 at = await trs.all_text_contents()
93 result = ( 107 result = (
...@@ -113,12 +127,16 @@ if __name__ == "__main__": ...@@ -113,12 +127,16 @@ if __name__ == "__main__":
113 # 获取程序开始执行的时间 127 # 获取程序开始执行的时间
114 start_time = time.time() 128 start_time = time.time()
115 param = SpiderParams( 129 param = SpiderParams(
116 startDate="1991-01-02", 130 startDate="2024-01-02",
117 endDate="1991-01-05", 131 endDate="2024-01-15",
118 target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE], 132 target_type=[
133 TypeEnum.ANIME,
134 TypeEnum.EPISODES,
135 TypeEnum.MOVIE,
136 TypeEnum.VARIETY,
137 ],
119 url="http://guduodata.com/", 138 url="http://guduodata.com/",
120 ) 139 )
121 res = asyncio.run(startBrowser(param)) 140 res = asyncio.run(startBrowser(param))
122 print(f"程序执行耗时时间:{time.time() - start_time} 长度为:{len(res)}") 141 print(f"代码重试情况:{startBrowser.statistics}")
123 for it in res: 142 print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}")
124 print(it)
......
...@@ -4,24 +4,33 @@ anyio==4.7.0 ...@@ -4,24 +4,33 @@ anyio==4.7.0
4 APScheduler==3.11.0 4 APScheduler==3.11.0
5 asyncmy==0.2.10 5 asyncmy==0.2.10
6 click==8.1.7 6 click==8.1.7
7 colorama==0.4.6
8 distlib==0.3.8
7 exceptiongroup==1.2.2 9 exceptiongroup==1.2.2
8 fastapi==0.115.6 10 fastapi==0.115.6
9 greenlet==3.1.1 11 greenlet==3.1.1
10 h11==0.14.0 12 h11==0.14.0
11 idna==3.10 13 idna==3.10
12 iso8601==2.1.0 14 iso8601==2.1.0
15 pandoc==2.3
13 playwright==1.49.1 16 playwright==1.49.1
17 plumbum==1.8.3
18 ply==3.11
14 pydantic==2.10.4 19 pydantic==2.10.4
15 pydantic_core==2.27.2 20 pydantic_core==2.27.2
16 pyee==12.0.0 21 pyee==12.0.0
17 pypika-tortoise==0.3.2 22 pypika-tortoise==0.3.2
18 pytz==2024.2 23 pytz==2024.2
24 pywin32==306
19 PyYAML==6.0.2 25 PyYAML==6.0.2
20 sniffio==1.3.1 26 sniffio==1.3.1
21 starlette==0.41.3 27 starlette==0.41.3
22 SuperStream==0.2.6 28 SuperStream==0.2.6
29 tenacity==9.0.0
23 tortoise-orm==0.22.2 30 tortoise-orm==0.22.2
24 tqdm==4.67.1 31 tqdm==4.67.1
25 typing_extensions==4.12.2 32 typing_extensions==4.12.2
33 tzdata==2024.2
26 tzlocal==5.2 34 tzlocal==5.2
27 uvicorn==0.34.0 35 uvicorn==0.34.0
36 virtualenv==20.25.0
......