update
Showing 5 changed files with 81 additions and 37 deletions
.vscode/launch.json
0 → 100644
1 | { | ||
2 | "version": "0.2.0", | ||
3 | "configurations": [ | ||
4 | { | ||
5 | "name": "Python: Current File with Modified PYTHONPATH", | ||
6 | "type": "python", | ||
7 | "request": "launch", | ||
8 | "program": "${file}", | ||
9 | "console": "integratedTerminal", | ||
10 | "justMyCode": true, | ||
11 | "env": { | ||
12 | "PYTHONPATH": "${workspaceFolder}/parent_directory:${env:PYTHONPATH}" | ||
13 | } | ||
14 | } | ||
15 | ] | ||
16 | } |
No preview for this file type
... | @@ -6,6 +6,7 @@ from playwright.async_api import Page, async_playwright | ... | @@ -6,6 +6,7 @@ from playwright.async_api import Page, async_playwright |
6 | from superstream import Stream | 6 | from superstream import Stream |
7 | from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum | 7 | from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum |
8 | import logging | 8 | import logging |
9 | from tqdm.asyncio import tqdm | ||
9 | 10 | ||
10 | logger = logging.getLogger(__name__) | 11 | logger = logging.getLogger(__name__) |
11 | 12 | ||
... | @@ -17,34 +18,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | ... | @@ -17,34 +18,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
17 | async with async_playwright() as p: | 18 | async with async_playwright() as p: |
18 | browser = await p.firefox.launch(headless=True) | 19 | browser = await p.firefox.launch(headless=True) |
19 | # 创建指定个数的浏览器页面 | 20 | # 创建指定个数的浏览器页面 |
20 | pages = await asyncio.gather(*(browser.new_page() | 21 | page_tasks = (browser.new_page() for _ in range(len(time_range))) |
21 | for _ in range(len(time_range)))) | 22 | pages_generator = tqdm.as_completed(page_tasks) |
23 | pages = list(pages_generator) | ||
22 | # 同时循环time_range与pages 去调用hand_one_data异步方法 | 24 | # 同时循环time_range与pages 去调用hand_one_data异步方法 |
23 | results = await asyncio.gather(*(hand_one_data(time_range[i], pages[i], | 25 | hand_result_tasks = ( |
24 | scrawl_types, url) | 26 | hand_one_data(time_range[i], pages[i], scrawl_types, url) |
25 | for i in range(len(time_range)))) | 27 | for i in range(len(time_range)) |
26 | return (Stream(results) | 28 | ) |
29 | results__generator = tqdm.as_completed(hand_result_tasks) | ||
30 | results = list(results__generator) | ||
31 | return ( | ||
32 | Stream(results) | ||
27 | .filter(lambda x: x is not None) | 33 | .filter(lambda x: x is not None) |
28 | .filter(lambda x: len(x) > 0) | 34 | .filter(lambda x: len(x) > 0) |
29 | .flat_map(lambda x: Stream(x)) | 35 | .flat_map(lambda x: Stream(x)) |
30 | .to_list()) | 36 | .to_list() |
37 | ) | ||
31 | 38 | ||
32 | 39 | ||
33 | async def hand_one_data(targetDate: str, | 40 | async def hand_one_data( |
34 | page: Page, | 41 | targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str |
35 | scrawl_types: List[TypeEnum], | 42 | ) -> List[GuoDuoSpiderResult]: |
36 | url: str) -> List[GuoDuoSpiderResult]: | ||
37 | # 把date这个日期对象解析为年、月、日 | 43 | # 把date这个日期对象解析为年、月、日 |
38 | year, month, day = targetDate.split('-') | 44 | year, month, day = targetDate.split("-") |
39 | day = day.replace('0', '') | 45 | day = day.replace("0", "") |
40 | month = int(month) - 1 | 46 | month = int(month) - 1 |
41 | await page.goto(url) | 47 | await page.goto(url) |
42 | # 等待遮罩层消失 | 48 | # 等待遮罩层消失 |
43 | await page.click('//*[@id="currentDateTxt"]') | 49 | await page.click('//*[@id="currentDateTxt"]') |
44 | data_css_selector = (f'#day span.pika-button.pika-day[type="button"]' | 50 | data_css_selector = ( |
51 | f'#day span.pika-button.pika-day[type="button"]' | ||
45 | f'[data-pika-year="{year}"]' | 52 | f'[data-pika-year="{year}"]' |
46 | f'[data-pika-month="{month}"]' | 53 | f'[data-pika-month="{month}"]' |
47 | f'[data-pika-day="{day}"]') | 54 | f'[data-pika-day="{day}"]' |
55 | ) | ||
48 | doc = page.locator(data_css_selector) | 56 | doc = page.locator(data_css_selector) |
49 | # 判断指定元素是否存在如果不存在就返回空的[] | 57 | # 判断指定元素是否存在如果不存在就返回空的[] |
50 | if not await doc.is_visible(): | 58 | if not await doc.is_visible(): |
... | @@ -52,12 +60,14 @@ async def hand_one_data(targetDate: str, | ... | @@ -52,12 +60,14 @@ async def hand_one_data(targetDate: str, |
52 | # 点击指定日期 | 60 | # 点击指定日期 |
53 | await doc.click() | 61 | await doc.click() |
54 | # 最后一步修正定位 | 62 | # 最后一步修正定位 |
55 | css_selectors = (Stream(scrawl_types) | 63 | css_selectors = ( |
64 | Stream(scrawl_types) | ||
56 | .map(lambda x: (x, x.get_precise_positioning(x))) | 65 | .map(lambda x: (x, x.get_precise_positioning(x))) |
57 | .group_by(lambda x: x[0])) | 66 | .group_by(lambda x: x[0]) |
67 | ) | ||
58 | results = [] | 68 | results = [] |
59 | for key, value in css_selectors.items(): | 69 | for key, value in css_selectors.items(): |
60 | logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据') | 70 | logger.info(f"开始爬取 {targetDate} 类型是{value[0][0]} 的数据") |
61 | for css_selector in value[0][1]: | 71 | for css_selector in value[0][1]: |
62 | await page.click(css_selector) | 72 | await page.click(css_selector) |
63 | result = await get_data(page, key.value, targetDate) | 73 | result = await get_data(page, key.value, targetDate) |
... | @@ -66,39 +76,49 @@ async def hand_one_data(targetDate: str, | ... | @@ -66,39 +76,49 @@ async def hand_one_data(targetDate: str, |
66 | return results | 76 | return results |
67 | 77 | ||
68 | 78 | ||
69 | async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]: | 79 | async def get_data( |
70 | target_data_css_selector = '.rank-box' | 80 | targetPage: Page, scrawl_type: int, target_time: str |
81 | ) -> Optional[List[GuoDuoSpiderResult]]: | ||
82 | target_data_css_selector = ".rank-box" | ||
71 | table = targetPage.locator(target_data_css_selector) | 83 | table = targetPage.locator(target_data_css_selector) |
72 | if table is None: | 84 | if table is None: |
73 | print(f'当前页面获取table为空:{scrawl_type}') | 85 | logger.info(f"当前页面获取table为空:{scrawl_type}") |
74 | return None | 86 | return None |
75 | # 获取当前这个doc_obj元素下面所有的tr标签 | 87 | # 获取当前这个doc_obj元素下面所有的tr标签 |
76 | trs = table.locator('tr') | 88 | trs = table.locator("tr") |
77 | if trs is None: | 89 | if trs is None: |
78 | print(f'当前页面获取tr为空:{scrawl_type}') | 90 | logger.info(f"当前页面获取tr为空:{scrawl_type}") |
79 | # 循环遍历trs下面的每个tr标签下面的td标签 | 91 | # 循环遍历trs下面的每个tr标签下面的td标签 |
80 | at = await trs.all_text_contents() | 92 | at = await trs.all_text_contents() |
81 | result = (Stream(at) | 93 | result = ( |
82 | .filter(lambda x: x.strip() != '') | 94 | Stream(at) |
83 | .filter(lambda x: len(x.split(' ')) > 6) | 95 | .filter(lambda x: x.strip() != "") |
84 | .map(lambda x: GuoDuoSpiderResult(time=target_time, | 96 | .filter(lambda x: len(x.split(" ")) > 6) |
97 | .map( | ||
98 | lambda x: GuoDuoSpiderResult( | ||
99 | time=target_time, | ||
85 | targetType=TypeEnum(scrawl_type), | 100 | targetType=TypeEnum(scrawl_type), |
86 | createTime=datetime.now(), | 101 | createTime=datetime.now(), |
87 | platform='all', | 102 | platform="all", |
88 | score=float(x.split(' ')[-1]), | 103 | score=float(x.split(" ")[-1]), |
89 | sourceName=x.split(' ')[5])) | 104 | sourceName=x.split(" ")[5], |
90 | .to_list()) | 105 | ) |
106 | ) | ||
107 | .to_list() | ||
108 | ) | ||
91 | return result | 109 | return result |
92 | 110 | ||
93 | 111 | ||
94 | if __name__ == '__main__': | 112 | if __name__ == "__main__": |
95 | # 获取程序开始执行的时间 | 113 | # 获取程序开始执行的时间 |
96 | start_time = time.time() | 114 | start_time = time.time() |
97 | param = SpiderParams(startDate='1991-01-02', | 115 | param = SpiderParams( |
98 | endDate='1991-01-05', | 116 | startDate="1991-01-02", |
99 | target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE] | 117 | endDate="1991-01-05", |
100 | , url='http://guduodata.com/') | 118 | target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE], |
119 | url="http://guduodata.com/", | ||
120 | ) | ||
101 | res = asyncio.run(startBrowser(param)) | 121 | res = asyncio.run(startBrowser(param)) |
102 | print(f'程序执行耗时时间:{time.time() - start_time} 长度为:{len(res)}') | 122 | print(f"程序执行耗时时间:{time.time() - start_time} 长度为:{len(res)}") |
103 | for it in res: | 123 | for it in res: |
104 | print(it) | 124 | print(it) | ... | ... |
... | @@ -21,6 +21,7 @@ sniffio==1.3.1 | ... | @@ -21,6 +21,7 @@ sniffio==1.3.1 |
21 | starlette==0.41.3 | 21 | starlette==0.41.3 |
22 | SuperStream==0.2.6 | 22 | SuperStream==0.2.6 |
23 | tortoise-orm==0.22.2 | 23 | tortoise-orm==0.22.2 |
24 | tqdm==4.67.1 | ||
24 | typing_extensions==4.12.2 | 25 | typing_extensions==4.12.2 |
25 | tzlocal==5.2 | 26 | tzlocal==5.2 |
26 | uvicorn==0.34.0 | 27 | uvicorn==0.34.0 | ... | ... |
-
Please register or sign in to post a comment