Commit 39153b19d1b22fa064762cd2f66fbf548b05e89b by wenxin

update

1 parent 828ad47e
@@ -3,3 +3,10 @@
 app/**/__pycache__
 
 app/logs/*
+
+.DS_Store
+
+./**/.DS_Store
+
+./.DS_Store
+
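A side note on the three new ignore entries: in gitignore syntax a pattern without a slash, such as `.DS_Store`, already matches files of that name at every depth, and as far as I can tell the `./`-prefixed forms are not standard gitignore anchoring at all. If that reading is right, the single pattern below covers all three added lines:

```gitignore
# matches .DS_Store in the repo root and in every subdirectory
.DS_Store
```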
@@ -0,0 +1,16 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File with Modified PYTHONPATH",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}/parent_directory:${env:PYTHONPATH}"
+            }
+        }
+    ]
+}
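This is evidently a VS Code `launch.json`: the `env` block prepends a directory to `PYTHONPATH` so that absolute imports like `app.schemas.spider_schema` resolve when debugging a single file. `${workspaceFolder}/parent_directory` reads like a template placeholder; presumably it should point at the directory that contains the `app` package. A minimal sketch (run as the debugged file) for checking that the injected path takes effect:

```python
# Sketch: confirm the debugger-injected PYTHONPATH makes "app" importable.
import importlib.util
import os

print(os.environ.get("PYTHONPATH", "(unset)"))  # should list the injected dir
print(importlib.util.find_spec("app"))          # not None once the path is right
```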
@@ -6,6 +6,7 @@ from playwright.async_api import Page, async_playwright
 from superstream import Stream
 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
 import logging
+from tqdm.asyncio import tqdm
 
 logger = logging.getLogger(__name__)
 
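For context on the new import: `tqdm.asyncio.tqdm` adds asyncio-aware classmethods to the usual progress bar. `tqdm.gather(*tasks)` behaves like `asyncio.gather` (one awaited call, results in submission order), while `tqdm.as_completed(tasks)` mirrors `asyncio.as_completed`: it needs a sized iterable (or an explicit `total=`) and yields awaitables in completion order, each of which must still be awaited. A self-contained sketch of both, assuming the tqdm==4.67.1 pinned below:

```python
import asyncio

from tqdm.asyncio import tqdm


async def work(i: int) -> int:
    await asyncio.sleep(0.1 * i)
    return i * i


async def main() -> None:
    # gather: one awaited call, results keep submission order
    ordered = await tqdm.gather(*(work(i) for i in range(5)))
    # as_completed: a sized iterable in, awaitables out in completion order
    unordered = [await f for f in tqdm.as_completed([work(i) for i in range(5)])]
    print(ordered)    # [0, 1, 4, 9, 16]
    print(unordered)  # completion order; ascending here because of the sleeps


asyncio.run(main())
```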
@@ -17,34 +18,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     async with async_playwright() as p:
         browser = await p.firefox.launch(headless=True)
         # Create one browser page per date in time_range
-        pages = await asyncio.gather(*(browser.new_page()
-                                       for _ in range(len(time_range))))
+        page_tasks = [browser.new_page() for _ in range(len(time_range))]
+        # tqdm.as_completed needs a sized iterable and yields awaitables as
+        # they finish; each one must still be awaited to get the actual Page
+        pages = [await task for task in tqdm.as_completed(page_tasks)]
         # Loop over time_range and pages together, running hand_one_data concurrently
-        results = await asyncio.gather(*(hand_one_data(time_range[i], pages[i],
-                                                       scrawl_types, url)
-                                         for i in range(len(time_range))))
-        return (Stream(results)
+        hand_result_tasks = [
+            hand_one_data(time_range[i], pages[i], scrawl_types, url)
+            for i in range(len(time_range))
+        ]
+        results = [await task for task in tqdm.as_completed(hand_result_tasks)]
+        return (
+            Stream(results)
             .filter(lambda x: x is not None)
             .filter(lambda x: len(x) > 0)
             .flat_map(lambda x: Stream(x))
-            .to_list())
+            .to_list()
+        )
 
 
-async def hand_one_data(targetDate: str,
-                        page: Page,
-                        scrawl_types: List[TypeEnum],
-                        url: str) -> List[GuoDuoSpiderResult]:
+async def hand_one_data(
+    targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
+) -> List[GuoDuoSpiderResult]:
     # Split the target date into year, month and day
-    year, month, day = targetDate.split('-')
-    day = day.replace('0', '')
+    year, month, day = targetDate.split("-")
+    # lstrip, not replace: replace("0", "") would turn day "10" into "1"
+    day = day.lstrip("0")
     month = int(month) - 1
     await page.goto(url)
     # Wait for the mask overlay to disappear
     await page.click('//*[@id="currentDateTxt"]')
-    data_css_selector = (f'#day span.pika-button.pika-day[type="button"]'
+    data_css_selector = (
+        f'#day span.pika-button.pika-day[type="button"]'
         f'[data-pika-year="{year}"]'
         f'[data-pika-month="{month}"]'
-        f'[data-pika-day="{day}"]')
+        f'[data-pika-day="{day}"]'
+    )
     doc = page.locator(data_css_selector)
     # If the target date element is not present, return an empty []
     if not await doc.is_visible():
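The `data-pika-*` attributes here belong to the Pikaday date picker, which renders each selectable day as a button whose `data-pika-month` is zero-based (hence the `int(month) - 1`) and whose `data-pika-day` carries no leading zero (hence the stripping). A hypothetical helper isolating that selector construction:

```python
# Hypothetical helper mirroring the selector built in hand_one_data above.
def pika_day_selector(date_str: str) -> str:
    year, month, day = date_str.split("-")
    return (
        '#day span.pika-button.pika-day[type="button"]'
        f'[data-pika-year="{year}"]'
        f'[data-pika-month="{int(month) - 1}"]'  # Pikaday months are 0-based
        f'[data-pika-day="{day.lstrip("0")}"]'   # day buttons drop leading zeros
    )


print(pika_day_selector("1991-01-02"))
# -> ...[data-pika-year="1991"][data-pika-month="0"][data-pika-day="2"]
```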
@@ -52,12 +60,14 @@ async def hand_one_data(targetDate: str,
     # Click the chosen date
     await doc.click()
     # Last step: refine the locator positioning
-    css_selectors = (Stream(scrawl_types)
+    css_selectors = (
+        Stream(scrawl_types)
         .map(lambda x: (x, x.get_precise_positioning(x)))
-        .group_by(lambda x: x[0]))
+        .group_by(lambda x: x[0])
+    )
     results = []
     for key, value in css_selectors.items():
-        logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据')
+        logger.info(f"start crawling {targetDate}, type {value[0][0]}")
         for css_selector in value[0][1]:
             await page.click(css_selector)
             result = await get_data(page, key.value, targetDate)
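The loop reads `value[0][1]`, i.e. the selector list of the first tuple in each group. Inferring SuperStream's `group_by` from that access pattern, it appears to return a dict mapping each key to the list of mapped tuples; a small sketch under that assumption:

```python
from superstream import Stream

# Assumed shape: {key: [(key, payload), ...]}, matching the
# css_selectors.items() / value[0][1] access pattern above.
groups = (
    Stream(["anime", "movie", "anime"])
    .map(lambda t: (t, f"#{t}-tab"))
    .group_by(lambda pair: pair[0])
)
print(groups)  # e.g. {"anime": [("anime", "#anime-tab"), ...], "movie": [...]}
```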
@@ -66,39 +76,49 @@
     return results
 
 
-async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]:
-    target_data_css_selector = '.rank-box'
+async def get_data(
+    targetPage: Page, scrawl_type: int, target_time: str
+) -> Optional[List[GuoDuoSpiderResult]]:
+    target_data_css_selector = ".rank-box"
     table = targetPage.locator(target_data_css_selector)
     if table is None:
-        print(f'当前页面获取table为空:{scrawl_type}')
+        logger.info(f"empty table on current page: {scrawl_type}")
         return None
     # Grab all tr tags under this element
-    trs = table.locator('tr')
+    trs = table.locator("tr")
     if trs is None:
-        print(f'当前页面获取tr为空:{scrawl_type}')
+        logger.info(f"empty tr on current page: {scrawl_type}")
     # Iterate over the td tags under each tr
     at = await trs.all_text_contents()
-    result = (Stream(at)
-              .filter(lambda x: x.strip() != '')
-              .filter(lambda x: len(x.split(' ')) > 6)
-              .map(lambda x: GuoDuoSpiderResult(time=target_time,
-                                                targetType=TypeEnum(scrawl_type),
-                                                createTime=datetime.now(),
-                                                platform='all',
-                                                score=float(x.split(' ')[-1]),
-                                                sourceName=x.split(' ')[5]))
-              .to_list())
+    result = (
+        Stream(at)
+        .filter(lambda x: x.strip() != "")
+        .filter(lambda x: len(x.split(" ")) > 6)
+        .map(
+            lambda x: GuoDuoSpiderResult(
+                time=target_time,
+                targetType=TypeEnum(scrawl_type),
+                createTime=datetime.now(),
+                platform="all",
+                score=float(x.split(" ")[-1]),
+                sourceName=x.split(" ")[5],
+            )
+        )
+        .to_list()
+    )
     return result
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     # Record the program start time
     start_time = time.time()
-    param = SpiderParams(startDate='1991-01-02',
-                         endDate='1991-01-05',
-                         target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE]
-                         , url='http://guduodata.com/')
+    param = SpiderParams(
+        startDate="1991-01-02",
+        endDate="1991-01-05",
+        target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE],
+        url="http://guduodata.com/",
+    )
     res = asyncio.run(startBrowser(param))
-    print(f'程序执行耗时时间:{time.time() - start_time} 长度为:{len(res)}')
+    print(f"elapsed: {time.time() - start_time}s, result count: {len(res)}")
     for it in res:
         print(it)
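One caveat in `get_data`: Playwright's `locator()` is lazy and never returns `None`, so the two `is None` guards above can never fire; emptiness has to be checked with `count()`. A standalone sketch (any reachable page without a `.rank-box` node demonstrates it):

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")
        table = page.locator(".rank-box")  # no such node on this page
        print(table is None)               # False: always a lazy Locator
        print(await table.count())         # 0: the actual emptiness signal
        await browser.close()


asyncio.run(main())
```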
@@ -21,6 +21,7 @@ sniffio==1.3.1
 starlette==0.41.3
 SuperStream==0.2.6
 tortoise-orm==0.22.2
+tqdm==4.67.1
 typing_extensions==4.12.2
 tzlocal==5.2
 uvicorn==0.34.0
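The `tqdm==4.67.1` pin matches the `from tqdm.asyncio import tqdm` import added above. A quick sketch for confirming the environment actually has the pinned version:

```python
# Sketch: check the installed tqdm against the pin in requirements.txt.
from importlib.metadata import version

print(version("tqdm"))  # expected: 4.67.1
```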