Commit 39153b19d1b22fa064762cd2f66fbf548b05e89b by wenxin

update

1 parent 828ad47e
@@ -3,3 +3,10 @@
 app/**/__pycache__
 app/logs/*
 .DS_Store
+./**/.DS_Store
+./.DS_Store

{
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File with Modified PYTHONPATH",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"env": {
"PYTHONPATH": "${workspaceFolder}/parent_directory:${env:PYTHONPATH}"
}
}
]
}
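Note on this launch configuration: the env block prepends ${workspaceFolder}/parent_directory to PYTHONPATH for the debugged process, so absolute imports such as app.schemas.spider_schema resolve even when a single file is launched via "program": "${file}". The name parent_directory reads like a placeholder for whatever directory actually contains the app package. A minimal sketch to verify the effect from inside a debug session (the print strings are illustrative only):

    import sys

    # PYTHONPATH entries are prepended to sys.path at interpreter start-up,
    # so the directory configured above should appear near the front.
    print([p for p in sys.path if p.endswith("parent_directory")])

    try:
        # With the app package on the path, the spider's absolute imports
        # resolve even though the current file was launched directly.
        import app.schemas.spider_schema  # noqa: F401
        print("app package importable")
    except ModuleNotFoundError:
        print("app package not on PYTHONPATH")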
@@ -6,6 +6,7 @@ from playwright.async_api import Page, async_playwright
 from superstream import Stream
 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
 import logging
+from tqdm.asyncio import tqdm
 logger = logging.getLogger(__name__)
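Since this commit also switches several print calls to logger.info (see the hunks below), the entry point needs a logging configuration, otherwise the INFO-level messages stay hidden behind Python's default WARNING threshold. A minimal sketch, assuming the root logger is otherwise unconfigured (the format string is an assumption, not from this repo):

    import logging

    # Without this, the spider's logger.info(...) calls produce no output:
    # an unconfigured root logger only emits WARNING and above.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )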
@@ -17,34 +18,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     async with async_playwright() as p:
         browser = await p.firefox.launch(headless=True)
         # Create the requested number of browser pages
-        pages = await asyncio.gather(*(browser.new_page()
-                                       for _ in range(len(time_range))))
+        page_tasks = (browser.new_page() for _ in range(len(time_range)))
+        # Await each page as it completes, with a tqdm progress bar
+        pages = [await task for task in tqdm.as_completed(page_tasks)]
         # Loop over time_range and pages together, calling the async hand_one_data
-        results = await asyncio.gather(*(hand_one_data(time_range[i], pages[i],
-                                                       scrawl_types, url)
-                                         for i in range(len(time_range))))
-        return (Stream(results)
+        hand_result_tasks = (
+            hand_one_data(time_range[i], pages[i], scrawl_types, url)
+            for i in range(len(time_range))
+        )
+        results = [await task for task in tqdm.as_completed(hand_result_tasks)]
+        return (
+            Stream(results)
             .filter(lambda x: x is not None)
             .filter(lambda x: len(x) > 0)
             .flat_map(lambda x: Stream(x))
-            .to_list())
+            .to_list()
+        )


-async def hand_one_data(targetDate: str,
-                        page: Page,
-                        scrawl_types: List[TypeEnum],
-                        url: str) -> List[GuoDuoSpiderResult]:
+async def hand_one_data(
+    targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
+) -> List[GuoDuoSpiderResult]:
     # Split the date into year, month, and day
-    year, month, day = targetDate.split('-')
-    day = day.replace('0', '')
+    year, month, day = targetDate.split("-")
+    # Strip the leading zero ("05" -> "5") to match the date-picker attribute
+    day = day.lstrip("0")
     month = int(month) - 1
     await page.goto(url)
     # Wait for the mask layer to disappear
     await page.click('//*[@id="currentDateTxt"]')
-    data_css_selector = (f'#day span.pika-button.pika-day[type="button"]'
+    data_css_selector = (
+        f'#day span.pika-button.pika-day[type="button"]'
         f'[data-pika-year="{year}"]'
         f'[data-pika-month="{month}"]'
-        f'[data-pika-day="{day}"]')
+        f'[data-pika-day="{day}"]'
+    )
     doc = page.locator(data_css_selector)
     # If the target element does not exist, return an empty []
     if not await doc.is_visible():
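A note on tqdm.as_completed, introduced in the hunk above: it wraps asyncio.as_completed, so each yielded awaitable must be awaited, and results arrive in completion order, not submission order. That is harmless here (the pages are interchangeable, and each hand_one_data result already carries its date), but tqdm.gather preserves input order when index alignment matters. A runnable sketch, with a made-up coroutine and timings:

    import asyncio

    from tqdm.asyncio import tqdm


    async def fetch(i: int) -> int:
        # Hypothetical stand-in for browser.new_page() / hand_one_data(...)
        await asyncio.sleep(0.1 * (5 - i))
        return i


    async def main() -> None:
        # gather: progress bar, results in submission order -> [0, 1, 2, 3, 4]
        ordered = await tqdm.gather(*(fetch(i) for i in range(5)))
        # as_completed: progress bar, results in completion order -> [4, 3, 2, 1, 0]
        completed = [await t for t in tqdm.as_completed([fetch(i) for i in range(5)])]
        print(ordered, completed)


    asyncio.run(main())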
@@ -52,12 +60,14 @@ async def hand_one_data(targetDate: str,
     # Click the chosen date
     await doc.click()
     # Final step: refine the element positioning
-    css_selectors = (Stream(scrawl_types)
+    css_selectors = (
+        Stream(scrawl_types)
         .map(lambda x: (x, x.get_precise_positioning(x)))
-        .group_by(lambda x: x[0]))
+        .group_by(lambda x: x[0])
+    )
     results = []
     for key, value in css_selectors.items():
-        logger.info(f'Start crawling data for {targetDate}, type {value[0][0]}')
+        logger.info(f"Start crawling data for {targetDate}, type {value[0][0]}")
         for css_selector in value[0][1]:
             await page.click(css_selector)
             result = await get_data(page, key.value, targetDate)
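For readers unfamiliar with SuperStream: the .items() loop above implies that group_by returns a dict-like mapping from key to the list of grouped elements, so value[0] is the first (type, selectors) tuple for that key and value[0][1] is the selector list that gets clicked. A dependency-free sketch of that shape (the toy type names and selectors are hypothetical):

    from collections import defaultdict

    pairs = [("MOVIE", ["#movie-tab"]), ("ANIME", ["#anime-tab"])]

    grouped = defaultdict(list)
    for item in pairs:
        grouped[item[0]].append(item)  # mirrors .group_by(lambda x: x[0])

    for key, value in grouped.items():
        # value[0][1] is the selector list used in the hunk above
        print(key, value[0][1])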
@@ -66,39 +76,49 @@ async def hand_one_data(targetDate: str,
     return results


-async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]:
-    target_data_css_selector = '.rank-box'
+async def get_data(
+    targetPage: Page, scrawl_type: int, target_time: str
+) -> Optional[List[GuoDuoSpiderResult]]:
+    target_data_css_selector = ".rank-box"
     table = targetPage.locator(target_data_css_selector)
     if table is None:
-        print(f'Got an empty table on the current page: {scrawl_type}')
+        logger.info(f"Got an empty table on the current page: {scrawl_type}")
         return None
     # Get all tr tags under this element
-    trs = table.locator('tr')
+    trs = table.locator("tr")
     if trs is None:
-        print(f'Got an empty tr on the current page: {scrawl_type}')
+        logger.info(f"Got an empty tr on the current page: {scrawl_type}")
     # Iterate over the td tags under each tr in trs
     at = await trs.all_text_contents()
-    result = (Stream(at)
-              .filter(lambda x: x.strip() != '')
-              .filter(lambda x: len(x.split(' ')) > 6)
-              .map(lambda x: GuoDuoSpiderResult(time=target_time,
-                                                targetType=TypeEnum(scrawl_type),
-                                                createTime=datetime.now(),
-                                                platform='all',
-                                                score=float(x.split(' ')[-1]),
-                                                sourceName=x.split(' ')[5]))
-              .to_list())
+    result = (
+        Stream(at)
+        .filter(lambda x: x.strip() != "")
+        .filter(lambda x: len(x.split(" ")) > 6)
+        .map(
+            lambda x: GuoDuoSpiderResult(
+                time=target_time,
+                targetType=TypeEnum(scrawl_type),
+                createTime=datetime.now(),
+                platform="all",
+                score=float(x.split(" ")[-1]),
+                sourceName=x.split(" ")[5],
+            )
+        )
+        .to_list()
+    )
     return result


-if __name__ == '__main__':
+if __name__ == "__main__":
     # Record the time the program started
     start_time = time.time()
-    param = SpiderParams(startDate='1991-01-02',
-                         endDate='1991-01-05',
-                         target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE]
-                         , url='http://guduodata.com/')
+    param = SpiderParams(
+        startDate="1991-01-02",
+        endDate="1991-01-05",
+        target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE],
+        url="http://guduodata.com/",
+    )
     res = asyncio.run(startBrowser(param))
-    print(f'Program elapsed time: {time.time() - start_time}, result count: {len(res)}')
+    print(f"Program elapsed time: {time.time() - start_time}, result count: {len(res)}")
     for it in res:
         print(it)
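One caveat in get_data above: Playwright's page.locator() always returns a Locator object, never None, so neither `is None` guard can ever fire; a locator is a lazy handle, and emptiness has to be queried from the page. A hedged sketch of the check this likely intends (the helper name is made up):

    from typing import Optional

    from playwright.async_api import Locator, Page


    async def find_rank_box(page: Page) -> Optional[Locator]:
        # Building a locator never touches the DOM, so it is always truthy.
        table = page.locator(".rank-box")
        # count() actually queries the page; zero means the table is absent.
        if await table.count() == 0:
            return None
        return table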
@@ -21,6 +21,7 @@ sniffio==1.3.1
 starlette==0.41.3
 SuperStream==0.2.6
 tortoise-orm==0.22.2
+tqdm==4.67.1
 typing_extensions==4.12.2
 tzlocal==5.2
 uvicorn==0.34.0