bug修复
Showing
9 changed files
with
186 additions
and
78 deletions
... | @@ -2,7 +2,7 @@ | ... | @@ -2,7 +2,7 @@ |
2 | <module type="PYTHON_MODULE" version="4"> | 2 | <module type="PYTHON_MODULE" version="4"> |
3 | <component name="NewModuleRootManager"> | 3 | <component name="NewModuleRootManager"> |
4 | <content url="file://$MODULE_DIR$" /> | 4 | <content url="file://$MODULE_DIR$" /> |
5 | <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" /> | 5 | <orderEntry type="jdk" jdkName="spider" jdkType="Python SDK" /> |
6 | <orderEntry type="sourceFolder" forTests="false" /> | 6 | <orderEntry type="sourceFolder" forTests="false" /> |
7 | </component> | 7 | </component> |
8 | <component name="PackageRequirementsSettings"> | 8 | <component name="PackageRequirementsSettings"> | ... | ... |
... | @@ -3,4 +3,5 @@ | ... | @@ -3,4 +3,5 @@ |
3 | <component name="Black"> | 3 | <component name="Black"> |
4 | <option name="sdkName" value="Spider" /> | 4 | <option name="sdkName" value="Spider" /> |
5 | </component> | 5 | </component> |
6 | <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" /> | ||
6 | </project> | 7 | </project> |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
.vscode/launch.json
0 → 100644
.vscode/settings.json
0 → 100644
1 | import logging | 1 | import logging |
2 | import os | 2 | import os |
3 | import sys | 3 | import sys |
4 | |||
4 | # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app' | 5 | # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app' |
5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
6 | import uvicorn | 7 | import uvicorn |
... | @@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig | ... | @@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig |
12 | 13 | ||
13 | app = FastAPI(lifespan=lifespan) | 14 | app = FastAPI(lifespan=lifespan) |
14 | app.include_router(spider_router.spiderRouter) | 15 | app.include_router(spider_router.spiderRouter) |
15 | if __name__ == '__main__': | 16 | if __name__ == "__main__": |
16 | appConf = getAppConfig() | 17 | appConf = getAppConfig() |
17 | uvicorn.run("main:app", host="0.0.0.0", | 18 | uvicorn.run( |
18 | port=appConf.server.port, | 19 | "main:app", host="0.0.0.0", port=appConf.server.port, log_config=getLogConfig() |
19 | log_config=getLogConfig()) | 20 | ) |
20 | logging.info(f"启动成功->{appConf.server}") | 21 | logging.info(f"启动成功->{appConf.server}") | ... | ... |
... | @@ -10,24 +10,32 @@ class TypeEnum(IntEnum): | ... | @@ -10,24 +10,32 @@ class TypeEnum(IntEnum): |
10 | EPISODES = 1 # 剧集 | 10 | EPISODES = 1 # 剧集 |
11 | MOVIE = 2 # 电影 | 11 | MOVIE = 2 # 电影 |
12 | ANIME = 3 # 动漫 | 12 | ANIME = 3 # 动漫 |
13 | VARIETY = 4 # 综艺 | ||
13 | 14 | ||
14 | def get_precise_positioning(self, other) -> List[str]: | 15 | def get_precise_positioning(self, other) -> List[str]: |
15 | if self.ANIME == other: | 16 | if self.ANIME == other: |
16 | return ['.cateIcon_5', | 17 | return [".cateIcon_5", "li.anime-platform:nth-child(1)"] |
17 | 'li.anime-platform:nth-child(1)'] | ||
18 | if self.MOVIE == other: | 18 | if self.MOVIE == other: |
19 | return ['.cateIcon_3', | 19 | return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"] |
20 | 'ul.type-box:nth-child(1) > li:nth-child(1)'] | ||
21 | if self.EPISODES == other: | 20 | if self.EPISODES == other: |
22 | return ['.cateIcon_1', | 21 | return [ |
23 | 'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)', | 22 | ".cateIcon_1", |
24 | 'ul.type-box:nth-child(1) > li:nth-child(1)'] | 23 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", |
24 | "ul.type-box:nth-child(1) > li:nth-child(1)", | ||
25 | ] | ||
26 | if self.VARIETY == other: | ||
27 | return [ | ||
28 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", | ||
29 | "ul.type-box:nth-child(1) > li:nth-child(1)", | ||
30 | ] | ||
25 | 31 | ||
26 | 32 | ||
27 | class GuoDuoSpiderResult(BaseModel): | 33 | class GuoDuoSpiderResult(BaseModel): |
28 | # time 数据时间 yyyy-MM-dd 格式的字符串 | 34 | # time 数据时间 yyyy-MM-dd 格式的字符串 |
29 | time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') | 35 | time: str = Field( |
30 | # type '1剧集 2电影 3 动漫', | 36 | ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$" |
37 | ) | ||
38 | # type '1剧集 2电影 3 动漫 4综艺', | ||
31 | targetType: TypeEnum = Field(..., description="数据类型") | 39 | targetType: TypeEnum = Field(..., description="数据类型") |
32 | # platform 平台名字 | 40 | # platform 平台名字 |
33 | platform: str | 41 | platform: str |
... | @@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel): | ... | @@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel): |
43 | 51 | ||
44 | 52 | ||
45 | class SpiderParams(BaseModel): | 53 | class SpiderParams(BaseModel): |
46 | startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') | 54 | startDate: str = Field( |
47 | endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$') | 55 | ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$" |
56 | ) | ||
57 | endDate: str = Field( | ||
58 | ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern="^\d{4}-\d{2}-\d{2}$" | ||
59 | ) | ||
48 | # type '1剧集 2电影 3 动漫', | 60 | # type '1剧集 2电影 3 动漫', |
49 | target_type: List[TypeEnum] | 61 | target_type: List[TypeEnum] |
50 | # 爬取的地址 | 62 | # 爬取的地址 |
... | @@ -57,12 +69,12 @@ class SpiderParams(BaseModel): | ... | @@ -57,12 +69,12 @@ class SpiderParams(BaseModel): |
57 | start_date = self.startDate | 69 | start_date = self.startDate |
58 | end_date = self.endDate | 70 | end_date = self.endDate |
59 | time_range = [] | 71 | time_range = [] |
60 | start_date_obj = datetime.strptime(start_date, '%Y-%m-%d') | 72 | start_date_obj = datetime.strptime(start_date, "%Y-%m-%d") |
61 | end_date_obj = datetime.strptime(end_date, '%Y-%m-%d') | 73 | end_date_obj = datetime.strptime(end_date, "%Y-%m-%d") |
62 | delta = end_date_obj - start_date_obj | 74 | delta = end_date_obj - start_date_obj |
63 | for i in range(delta.days + 1): | 75 | for i in range(delta.days + 1): |
64 | day = start_date_obj + timedelta(days=i) | 76 | day = start_date_obj + timedelta(days=i) |
65 | time_range.append(day.strftime('%Y-%m-%d')) | 77 | time_range.append(day.strftime("%Y-%m-%d")) |
66 | return time_range | 78 | return time_range |
67 | 79 | ||
68 | 80 | ... | ... |
... | @@ -5,7 +5,12 @@ from app.model.mysql_model import SpiderModel | ... | @@ -5,7 +5,12 @@ from app.model.mysql_model import SpiderModel |
5 | from superstream import Stream | 5 | from superstream import Stream |
6 | 6 | ||
7 | from app.job.job import scheduler | 7 | from app.job.job import scheduler |
8 | from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult | 8 | from app.schemas.spider_schema import ( |
9 | ApschedulerJob, | ||
10 | TaskInfo, | ||
11 | SpiderParams, | ||
12 | GuoDuoSpiderResult, | ||
13 | ) | ||
9 | from app.spider.guduo_spider import startBrowser | 14 | from app.spider.guduo_spider import startBrowser |
10 | 15 | ||
11 | logger = logging.getLogger(__name__) | 16 | logger = logging.getLogger(__name__) |
... | @@ -16,21 +21,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): | ... | @@ -16,21 +21,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): |
16 | cron = taskInfo.executeStrategy | 21 | cron = taskInfo.executeStrategy |
17 | cron_parameterization = cron.split(" ") | 22 | cron_parameterization = cron.split(" ") |
18 | minute = cron_parameterization[1] | 23 | minute = cron_parameterization[1] |
19 | minute = (None if minute == "?" else minute) | 24 | minute = None if minute == "?" else minute |
20 | hour = cron_parameterization[2] | 25 | hour = cron_parameterization[2] |
21 | hour = (None if hour == "?" else hour) | 26 | hour = None if hour == "?" else hour |
22 | day = cron_parameterization[3] | 27 | day = cron_parameterization[3] |
23 | day = (None if day == "?" else day) | 28 | day = None if day == "?" else day |
24 | month = cron_parameterization[4] | 29 | month = cron_parameterization[4] |
25 | month = (None if month == "?" else month) | 30 | month = None if month == "?" else month |
26 | weekend = cron_parameterization[5] | 31 | weekend = cron_parameterization[5] |
27 | weekend = (None if weekend == "?" else weekend) | 32 | weekend = None if weekend == "?" else weekend |
28 | job = scheduler.add_job(task, "cron", | 33 | job = scheduler.add_job( |
29 | minute=minute, hour=hour, day=day, month=month, week=weekend, | 34 | task, |
30 | args=params) | 35 | "cron", |
36 | minute=minute, | ||
37 | hour=hour, | ||
38 | day=day, | ||
39 | month=month, | ||
40 | week=weekend, | ||
41 | args=params, | ||
42 | ) | ||
31 | jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId)) | 43 | jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId)) |
32 | logger.info( | 44 | logger.info( |
33 | f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}") | 45 | f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}" |
46 | ) | ||
34 | 47 | ||
35 | 48 | ||
36 | def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): | 49 | def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): |
... | @@ -52,18 +65,17 @@ def delete_job(taskId: int): | ... | @@ -52,18 +65,17 @@ def delete_job(taskId: int): |
52 | 65 | ||
53 | 66 | ||
54 | def get_job_info(taskId: int): | 67 | def get_job_info(taskId: int): |
55 | job = (Stream(jobs) | 68 | job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first() |
56 | .filter(lambda x: x.taskId == taskId) | 69 | return f"job 信息->{job}" |
57 | .find_first()) | ||
58 | return f'job 信息->{job}' | ||
59 | 70 | ||
60 | 71 | ||
61 | async def scrawl_and_save(taskParam: SpiderParams): | 72 | async def scrawl_and_save(taskParam: SpiderParams): |
62 | # 执行爬虫获取结果 | 73 | # 执行爬虫获取结果 |
63 | results = await startBrowser(taskParam) | 74 | results = await startBrowser(taskParam) |
75 | logger.info(f"爬虫重试情况:{startBrowser.statistics}") | ||
64 | asyncTasks = (save_or_update(item) for item in results) | 76 | asyncTasks = (save_or_update(item) for item in results) |
65 | await asyncio.gather(*asyncTasks) | 77 | await asyncio.gather(*asyncTasks) |
66 | logger.info(f'爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成') | 78 | logger.info(f"爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成") |
67 | 79 | ||
68 | 80 | ||
69 | async def save_or_update(result: GuoDuoSpiderResult): | 81 | async def save_or_update(result: GuoDuoSpiderResult): |
... | @@ -75,10 +87,7 @@ async def save_or_update(result: GuoDuoSpiderResult): | ... | @@ -75,10 +87,7 @@ async def save_or_update(result: GuoDuoSpiderResult): |
75 | createTime = result.createTime | 87 | createTime = result.createTime |
76 | # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入 | 88 | # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入 |
77 | obj = await SpiderModel.get_or_none( | 89 | obj = await SpiderModel.get_or_none( |
78 | time=time, | 90 | time=time, targetType=targetType, platform=platform, sourceName=sourceName |
79 | targetType=targetType, | ||
80 | platform=platform, | ||
81 | sourceName=sourceName | ||
82 | ) | 91 | ) |
83 | if obj: | 92 | if obj: |
84 | for key, value in result.__dict__.items(): | 93 | for key, value in result.__dict__.items(): |
... | @@ -90,6 +99,6 @@ async def save_or_update(result: GuoDuoSpiderResult): | ... | @@ -90,6 +99,6 @@ async def save_or_update(result: GuoDuoSpiderResult): |
90 | platform=platform, | 99 | platform=platform, |
91 | sourceName=sourceName, | 100 | sourceName=sourceName, |
92 | score=score, | 101 | score=score, |
93 | createTime=createTime | 102 | createTime=createTime, |
94 | ) | 103 | ) |
95 | await obj.save() | 104 | await obj.save() | ... | ... |
... | @@ -7,10 +7,24 @@ from superstream import Stream | ... | @@ -7,10 +7,24 @@ from superstream import Stream |
7 | from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum | 7 | from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum |
8 | import logging | 8 | import logging |
9 | from tqdm.asyncio import tqdm_asyncio | 9 | from tqdm.asyncio import tqdm_asyncio |
10 | from tenacity import ( | ||
11 | after_log, | ||
12 | before_sleep_log, | ||
13 | retry, | ||
14 | stop_after_attempt, | ||
15 | wait_exponential, | ||
16 | wait_fixed, | ||
17 | ) | ||
10 | 18 | ||
11 | logger = logging.getLogger(__name__) | 19 | logger = logging.getLogger(__name__) |
12 | 20 | ||
13 | 21 | ||
22 | @retry( | ||
23 | stop=stop_after_attempt(3), | ||
24 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
25 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
26 | after=after_log(logger, logging.DEBUG), | ||
27 | ) | ||
14 | async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | 28 | async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
15 | time_range = spiderParam.get_time_range() | 29 | time_range = spiderParam.get_time_range() |
16 | url = spiderParam.url | 30 | url = spiderParam.url |
... | @@ -18,34 +32,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | ... | @@ -18,34 +32,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
18 | async with async_playwright() as p: | 32 | async with async_playwright() as p: |
19 | browser = await p.firefox.launch(headless=True) | 33 | browser = await p.firefox.launch(headless=True) |
20 | # 创建指定个数的浏览器页面 | 34 | # 创建指定个数的浏览器页面 |
21 | pages = await tqdm_asyncio.gather(*(browser.new_page() | 35 | pages = await tqdm_asyncio.gather( |
22 | for _ in range(len(time_range)))) | 36 | *(browser.new_page() for _ in range(len(time_range))) |
37 | ) | ||
23 | # 同时循环time_range与pages 去调用hand_one_data异步方法 | 38 | # 同时循环time_range与pages 去调用hand_one_data异步方法 |
24 | results = await tqdm_asyncio.gather(*(hand_one_data(time_range[i], pages[i], | 39 | results = await tqdm_asyncio.gather( |
25 | scrawl_types, url) | 40 | *( |
26 | for i in range(len(time_range)))) | 41 | hand_one_data(time_range[i], pages[i], scrawl_types, url) |
27 | return (Stream(results) | 42 | for i in range(len(time_range)) |
43 | ) | ||
44 | ) | ||
45 | return ( | ||
46 | Stream(results) | ||
28 | .filter(lambda x: x is not None) | 47 | .filter(lambda x: x is not None) |
29 | .filter(lambda x: len(x) > 0) | 48 | .filter(lambda x: len(x) > 0) |
30 | .flat_map(lambda x: Stream(x)) | 49 | .flat_map(lambda x: Stream(x)) |
31 | .to_list()) | 50 | .to_list() |
51 | ) | ||
32 | 52 | ||
33 | 53 | ||
34 | async def hand_one_data(targetDate: str, | 54 | async def hand_one_data( |
35 | page: Page, | 55 | targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str |
36 | scrawl_types: List[TypeEnum], | 56 | ) -> List[GuoDuoSpiderResult]: |
37 | url: str) -> List[GuoDuoSpiderResult]: | ||
38 | # 把targetDate这个日期字符串解析为年、月、日 | 57 | # 把targetDate这个日期字符串解析为年、月、日 |
39 | year, month, day = targetDate.split('-') | 58 | year, month, day = targetDate.split("-") |
40 | day = day.replace('0', '') | 59 | day = day.replace("0", "") |
41 | month = int(month) - 1 | 60 | month = int(month) - 1 |
42 | await page.goto(url) | 61 | await page.goto(url) |
43 | # 等待遮罩层消失 | 62 | # 等待遮罩层消失 |
44 | await page.click('//*[@id="currentDateTxt"]') | 63 | await page.click('//*[@id="currentDateTxt"]') |
45 | data_css_selector = (f'#day span.pika-button.pika-day[type="button"]' | 64 | data_css_selector = ( |
65 | f'#day span.pika-button.pika-day[type="button"]' | ||
46 | f'[data-pika-year="{year}"]' | 66 | f'[data-pika-year="{year}"]' |
47 | f'[data-pika-month="{month}"]' | 67 | f'[data-pika-month="{month}"]' |
48 | f'[data-pika-day="{day}"]') | 68 | f'[data-pika-day="{day}"]' |
69 | ) | ||
49 | doc = page.locator(data_css_selector) | 70 | doc = page.locator(data_css_selector) |
50 | # 判断指定元素是否存在如果不存在就返回空的[] | 71 | # 判断指定元素是否存在如果不存在就返回空的[] |
51 | if not await doc.is_visible(): | 72 | if not await doc.is_visible(): |
... | @@ -53,12 +74,14 @@ async def hand_one_data(targetDate: str, | ... | @@ -53,12 +74,14 @@ async def hand_one_data(targetDate: str, |
53 | # 点击指定日期 | 74 | # 点击指定日期 |
54 | await doc.click() | 75 | await doc.click() |
55 | # 最后一步修正定位 | 76 | # 最后一步修正定位 |
56 | css_selectors = (Stream(scrawl_types) | 77 | css_selectors = ( |
78 | Stream(scrawl_types) | ||
57 | .map(lambda x: (x, x.get_precise_positioning(x))) | 79 | .map(lambda x: (x, x.get_precise_positioning(x))) |
58 | .group_by(lambda x: x[0])) | 80 | .group_by(lambda x: x[0]) |
81 | ) | ||
59 | results = [] | 82 | results = [] |
60 | for key, value in css_selectors.items(): | 83 | for key, value in css_selectors.items(): |
61 | logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据') | 84 | logger.info(f"开始爬取 {targetDate} 类型是{value[0][0]} 的数据") |
62 | for css_selector in value[0][1]: | 85 | for css_selector in value[0][1]: |
63 | await page.click(css_selector) | 86 | await page.click(css_selector) |
64 | result = await get_data(page, key.value, targetDate) | 87 | result = await get_data(page, key.value, targetDate) |
... | @@ -67,39 +90,53 @@ async def hand_one_data(targetDate: str, | ... | @@ -67,39 +90,53 @@ async def hand_one_data(targetDate: str, |
67 | return results | 90 | return results |
68 | 91 | ||
69 | 92 | ||
70 | async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]: | 93 | async def get_data( |
71 | target_data_css_selector = '.rank-box' | 94 | targetPage: Page, scrawl_type: int, target_time: str |
95 | ) -> Optional[List[GuoDuoSpiderResult]]: | ||
96 | target_data_css_selector = ".rank-box" | ||
72 | table = targetPage.locator(target_data_css_selector) | 97 | table = targetPage.locator(target_data_css_selector) |
73 | if table is None: | 98 | if table is None: |
74 | print(f'当前页面获取table为空:{scrawl_type}') | 99 | print(f"当前页面获取table为空:{scrawl_type}") |
75 | return None | 100 | return None |
76 | # 获取当前这个doc_obj元素下面所有的tr标签 | 101 | # 获取当前这个doc_obj元素下面所有的tr标签 |
77 | trs = table.locator('tr') | 102 | trs = table.locator("tr") |
78 | if trs is None: | 103 | if trs is None: |
79 | print(f'当前页面获取tr为空:{scrawl_type}') | 104 | print(f"当前页面获取tr为空:{scrawl_type}") |
80 | # 循环遍历trs下面的每个tr标签下面的td标签 | 105 | # 循环遍历trs下面的每个tr标签下面的td标签 |
81 | at = await trs.all_text_contents() | 106 | at = await trs.all_text_contents() |
82 | result = (Stream(at) | 107 | result = ( |
83 | .filter(lambda x: x.strip() != '') | 108 | Stream(at) |
84 | .filter(lambda x: len(x.split(' ')) > 6) | 109 | .filter(lambda x: x.strip() != "") |
85 | .map(lambda x: GuoDuoSpiderResult(time=target_time, | 110 | .filter(lambda x: len(x.split(" ")) > 6) |
111 | .map( | ||
112 | lambda x: GuoDuoSpiderResult( | ||
113 | time=target_time, | ||
86 | targetType=TypeEnum(scrawl_type), | 114 | targetType=TypeEnum(scrawl_type), |
87 | createTime=datetime.now(), | 115 | createTime=datetime.now(), |
88 | platform='all', | 116 | platform="all", |
89 | score=float(x.split(' ')[-1]), | 117 | score=float(x.split(" ")[-1]), |
90 | sourceName=x.split(' ')[5])) | 118 | sourceName=x.split(" ")[5], |
91 | .to_list()) | 119 | ) |
120 | ) | ||
121 | .to_list() | ||
122 | ) | ||
92 | return result | 123 | return result |
93 | 124 | ||
94 | 125 | ||
95 | if __name__ == '__main__': | 126 | if __name__ == "__main__": |
96 | # 获取程序开始执行的时间 | 127 | # 获取程序开始执行的时间 |
97 | start_time = time.time() | 128 | start_time = time.time() |
98 | param = SpiderParams(startDate='2024-01-02', | 129 | param = SpiderParams( |
99 | endDate='2024-01-15', | 130 | startDate="2024-01-02", |
100 | target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE] | 131 | endDate="2024-01-15", |
101 | , url='http://guduodata.com/') | 132 | target_type=[ |
133 | TypeEnum.ANIME, | ||
134 | TypeEnum.EPISODES, | ||
135 | TypeEnum.MOVIE, | ||
136 | TypeEnum.VARIETY, | ||
137 | ], | ||
138 | url="http://guduodata.com/", | ||
139 | ) | ||
102 | res = asyncio.run(startBrowser(param)) | 140 | res = asyncio.run(startBrowser(param)) |
103 | print(f'程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}') | 141 | print(f"代码重试情况:{startBrowser.statistics}") |
104 | for it in res: | 142 | print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}") |
105 | print(it) | ... | ... |
requirements.txt
0 → 100644
1 | aiosqlite==0.20.0 | ||
2 | annotated-types==0.7.0 | ||
3 | anyio==4.7.0 | ||
4 | APScheduler==3.11.0 | ||
5 | asyncmy==0.2.10 | ||
6 | click==8.1.7 | ||
7 | colorama==0.4.6 | ||
8 | distlib==0.3.8 | ||
9 | exceptiongroup==1.2.2 | ||
10 | fastapi==0.115.6 | ||
11 | greenlet==3.1.1 | ||
12 | h11==0.14.0 | ||
13 | idna==3.10 | ||
14 | iso8601==2.1.0 | ||
15 | pandoc==2.3 | ||
16 | playwright==1.49.1 | ||
17 | plumbum==1.8.3 | ||
18 | ply==3.11 | ||
19 | pydantic==2.10.4 | ||
20 | pydantic_core==2.27.2 | ||
21 | pyee==12.0.0 | ||
22 | pypika-tortoise==0.3.2 | ||
23 | pytz==2024.2 | ||
24 | pywin32==306 | ||
25 | PyYAML==6.0.2 | ||
26 | sniffio==1.3.1 | ||
27 | starlette==0.41.3 | ||
28 | SuperStream==0.2.6 | ||
29 | tenacity==9.0.0 | ||
30 | tortoise-orm==0.22.2 | ||
31 | tqdm==4.67.1 | ||
32 | typing_extensions==4.12.2 | ||
33 | tzdata==2024.2 | ||
34 | tzlocal==5.2 | ||
35 | uvicorn==0.34.0 | ||
36 | virtualenv==20.25.0 |
-
Please register or sign in to post a comment