bug修复

文鑫
Commit 4c03d0d2 ... 4c03d0d2ad43b5a1b64266e4a6c5b2313a0a84f3 authored 2024-12-22 13:47:22 +0800 by 文鑫
Showing 9 changed files with 186 additions and 78 deletions
.idea/FastAPIProject1.iml
.idea/misc.xml
.vscode/launch.json
.vscode/settings.json
app/main.py
app/schemas/spider_schema.py
app/service/spider_job_service.py
app/spider/guduo_spider.py
requirements.txt
--- a/.idea/FastAPIProject1.iml
View file @4c03d0d
+++ b/.idea/FastAPIProject1.iml
View file @4c03d0d
@@ -2,7 +2,7 @@
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="spider" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PackageRequirementsSettings">
--- a/.idea/misc.xml
View file @4c03d0d
+++ b/.idea/misc.xml
View file @4c03d0d
@@ -3,4 +3,5 @@
  <component name="Black">
    <option name="sdkName" value="Spider" />
  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
--- a/.vscode/launch.json 0 → 100644
View file @4c03d0d
+++ b/.vscode/launch.json 0 → 100644
View file @4c03d0d
+{
+  "version": "0.2.0",
+  "configurations": []
+}
--- a/.vscode/settings.json 0 → 100644
View file @4c03d0d
+++ b/.vscode/settings.json 0 → 100644
View file @4c03d0d
+{
+  "files.exclude": {
+    "__pycache__": true,
+    "**/*.pyc": true,
+    "**/*.pyo": true,
+    "**/*.pyd": true
+  }
+}
--- a/app/main.py
View file @4c03d0d
+++ b/app/main.py
View file @4c03d0d
 import logging
 import os
 import sys
 # 把当前文件所在文件夹的父文件夹路径加入到PYTHONPATH 解决ModuleNotFoundError: No module named 'app'
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import uvicorn
@@ -12,9 +13,9 @@ from app.config.log_config import getLogConfig
 app = FastAPI(lifespan=lifespan)
 app.include_router(spider_router.spiderRouter)
-if __name__ == '__main__':
+if __name__ == "__main__":
    appConf = getAppConfig()
-    uvicorn.run("main:app", host="0.0.0.0",
+    uvicorn.run(
-                port=appConf.server.port,
+        "main:app", host="0.0.0.0", port=appConf.server.port, log_config=getLogConfig()
-                log_config=getLogConfig())
+    )
    logging.info(f"启动成功->{appConf.server}")
--- a/app/schemas/spider_schema.py
View file @4c03d0d
+++ b/app/schemas/spider_schema.py
View file @4c03d0d
@@ -10,24 +10,32 @@ class TypeEnum(IntEnum):
    EPISODES = 1  # 剧集
    MOVIE = 2  # 电影
    ANIME = 3  # 动漫
+    VARIETY = 4  # 综艺
    def get_precise_positioning(self, other) -> List[str]:
        if self.ANIME == other:
-            return ['.cateIcon_5',
+            return [".cateIcon_5", "li.anime-platform:nth-child(1)"]
-                    'li.anime-platform:nth-child(1)']
        if self.MOVIE == other:
-            return ['.cateIcon_3',
+            return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"]
-                    'ul.type-box:nth-child(1) > li:nth-child(1)']
        if self.EPISODES == other:
-            return ['.cateIcon_1',
+            return [
-                    'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)',
+                ".cateIcon_1",
-                    'ul.type-box:nth-child(1) > li:nth-child(1)']
+                "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
+                "ul.type-box:nth-child(1) > li:nth-child(1)",
+            ]
+        if self.VARIETY == other:
+            return [
+                "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
+                "ul.type-box:nth-child(1) > li:nth-child(1)",
+            ]
 class GuoDuoSpiderResult(BaseModel):
    # time 数据时间 yyyy-MM-dd 格式的字符串
-    time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$')
+    time: str = Field(
-    # type  '1剧集 2电影 3 动漫',
+        # Raw string: "\d" in a normal literal is an invalid escape sequence (warning on modern Python).
+        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
+    )
+    # type  '1剧集 2电影 3 动漫 4综艺',
    targetType: TypeEnum = Field(..., description="数据类型")
    # platform  平台名字
    platform: str
@@ -43,8 +51,12 @@ class GuoDuoSpiderResult(BaseModel):
 class SpiderParams(BaseModel):
-    startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$')
+    startDate: str = Field(
-    endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern='^\d{4}-\d{2}-\d{2}$')
+        # Raw string: "\d" in a normal literal is an invalid escape sequence (warning on modern Python).
+        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
+    )
+    endDate: str = Field(
+        # Raw string: "\d" in a normal literal is an invalid escape sequence (warning on modern Python).
+        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
+    )
    # type  '1剧集 2电影 3 动漫 4综艺',
    target_type: List[TypeEnum]
    # 爬取的地址
@@ -57,12 +69,12 @@ class SpiderParams(BaseModel):
        start_date = self.startDate
        end_date = self.endDate
        time_range = []
-        start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
+        start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
-        end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')
+        end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
        delta = end_date_obj - start_date_obj
        for i in range(delta.days + 1):
            day = start_date_obj + timedelta(days=i)
-            time_range.append(day.strftime('%Y-%m-%d'))
+            time_range.append(day.strftime("%Y-%m-%d"))
        return time_range
--- a/app/service/spider_job_service.py
View file @4c03d0d
+++ b/app/service/spider_job_service.py
View file @4c03d0d
@@ -5,7 +5,12 @@ from app.model.mysql_model import SpiderModel
 from superstream import Stream
 from app.job.job import scheduler
-from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult
+from app.schemas.spider_schema import (
+    ApschedulerJob,
+    TaskInfo,
+    SpiderParams,
+    GuoDuoSpiderResult,
+)
 from app.spider.guduo_spider import startBrowser
 logger = logging.getLogger(__name__)
@@ -16,21 +21,29 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
    cron = taskInfo.executeStrategy
    cron_parameterization = cron.split(" ")
    minute = cron_parameterization[1]
-    minute = (None if minute == "?" else minute)
+    minute = None if minute == "?" else minute
    hour = cron_parameterization[2]
-    hour = (None if hour == "?" else hour)
+    hour = None if hour == "?" else hour
    day = cron_parameterization[3]
-    day = (None if day == "?" else day)
+    day = None if day == "?" else day
    month = cron_parameterization[4]
-    month = (None if month == "?" else month)
+    month = None if month == "?" else month
    weekend = cron_parameterization[5]
-    weekend = (None if weekend == "?" else weekend)
+    weekend = None if weekend == "?" else weekend
-    job = scheduler.add_job(task, "cron",
+    job = scheduler.add_job(
-                            minute=minute, hour=hour, day=day, month=month, week=weekend,
+        task,
-                            args=params)
+        "cron",
+        minute=minute,
+        hour=hour,
+        day=day,
+        month=month,
+        # The field after month in the cron expression is a weekday; APScheduler's
+        # `week` keyword is the ISO week number (1-53), so use `day_of_week` instead.
+        # NOTE(review): confirm the weekday numbering used in executeStrategy matches
+        # APScheduler's (0 = Monday) before relying on numeric values.
+        day_of_week=weekend,
+        args=params,
+    )
    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
    logger.info(
-        f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}")
+        f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}"
+    )
 def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
@@ -52,18 +65,17 @@ def delete_job(taskId: int):
 def get_job_info(taskId: int):
-    job = (Stream(jobs)
+    job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
-           .filter(lambda x: x.taskId == taskId)
+    return f"job 信息->{job}"
-           .find_first())
-    return f'job 信息->{job}'
 async def scrawl_and_save(taskParam: SpiderParams):
    # 执行爬虫获取结果
    results = await startBrowser(taskParam)
+    logger.info(f"爬虫重试情况:{startBrowser.statistics}")
    asyncTasks = (save_or_update(item) for item in results)
    await asyncio.gather(*asyncTasks)
-    logger.info(f'爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成')
+    logger.info(f"爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成")
 async def save_or_update(result: GuoDuoSpiderResult):
@@ -75,10 +87,7 @@ async def save_or_update(result: GuoDuoSpiderResult):
    createTime = result.createTime
    # 使用SpiderModel 查询 time targetType platform sourceName 等于以上的值是否存在如果存在就更新不存在就插入
    obj = await SpiderModel.get_or_none(
-        time=time,
+        time=time, targetType=targetType, platform=platform, sourceName=sourceName
-        targetType=targetType,
-        platform=platform,
-        sourceName=sourceName
    )
    if obj:
        for key, value in result.__dict__.items():
@@ -90,6 +99,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
            platform=platform,
            sourceName=sourceName,
            score=score,
-            createTime=createTime
+            createTime=createTime,
        )
    await obj.save()
--- a/app/spider/guduo_spider.py
View file @4c03d0d
+++ b/app/spider/guduo_spider.py
View file @4c03d0d
@@ -7,10 +7,24 @@ from superstream import Stream
 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
 import logging
 from tqdm.asyncio import tqdm_asyncio
+from tenacity import (
+    after_log,
+    before_sleep_log,
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    wait_fixed,
+)
 logger = logging.getLogger(__name__)
+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
    time_range = spiderParam.get_time_range()
    url = spiderParam.url
@@ -18,34 +32,41 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        # 创建指定个数的浏览器页面
-        pages = await tqdm_asyncio.gather(*(browser.new_page()
+        pages = await tqdm_asyncio.gather(
-                                       for _ in range(len(time_range))))
+            *(browser.new_page() for _ in range(len(time_range)))
+        )
        # 同时循环time_range与pages 去调用hand_one_data异步方法
-        results = await tqdm_asyncio.gather(*(hand_one_data(time_range[i], pages[i],
+        results = await tqdm_asyncio.gather(
-                                                       scrawl_types, url)
+            *(
-                                         for i in range(len(time_range))))
+                hand_one_data(time_range[i], pages[i], scrawl_types, url)
-        return (Stream(results)
+                for i in range(len(time_range))
+            )
+        )
+        return (
+            Stream(results)
            .filter(lambda x: x is not None)
            .filter(lambda x: len(x) > 0)
            .flat_map(lambda x: Stream(x))
-                .to_list())
+            .to_list()
+        )
-async def hand_one_data(targetDate: str,
+async def hand_one_data(
-                        page: Page,
+    targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str
-                        scrawl_types: List[TypeEnum],
+) -> List[GuoDuoSpiderResult]:
-                        url: str) -> List[GuoDuoSpiderResult]:
    # 把date这个日期对象解析为年、月、日
-    year, month, day = targetDate.split('-')
+    year, month, day = targetDate.split("-")
-    day = day.replace('0', '')
+    # Drop only the zero padding: replace("0", "") also turned "10"/"20"/"30"
+    # into "1"/"2"/"3", which selected the wrong day in the date picker.
+    day = str(int(day))
    month = int(month) - 1
    await page.goto(url)
    # 等待遮罩层消失
    await page.click('//*[@id="currentDateTxt"]')
-    data_css_selector = (f'#day span.pika-button.pika-day[type="button"]'
+    data_css_selector = (
+        f'#day span.pika-button.pika-day[type="button"]'
        f'[data-pika-year="{year}"]'
        f'[data-pika-month="{month}"]'
-                         f'[data-pika-day="{day}"]')
+        f'[data-pika-day="{day}"]'
+    )
    doc = page.locator(data_css_selector)
    # 判断指定元素是否存在如果不存在就返回空的[]
    if not await doc.is_visible():
@@ -53,12 +74,14 @@ async def hand_one_data(targetDate: str,
    # 点击指定日期
    await doc.click()
    # 最后一步修正定位
-    css_selectors = (Stream(scrawl_types)
+    css_selectors = (
+        Stream(scrawl_types)
        .map(lambda x: (x, x.get_precise_positioning(x)))
-                     .group_by(lambda x: x[0]))
+        .group_by(lambda x: x[0])
+    )
    results = []
    for key, value in css_selectors.items():
-        logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据')
+        logger.info(f"开始爬取 {targetDate} 类型是{value[0][0]} 的数据")
        for css_selector in value[0][1]:
            await page.click(css_selector)
        result = await get_data(page, key.value, targetDate)
@@ -67,39 +90,53 @@ async def hand_one_data(targetDate: str,
    return results
-async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]:
+async def get_data(
-    target_data_css_selector = '.rank-box'
+    targetPage: Page, scrawl_type: int, target_time: str
+) -> Optional[List[GuoDuoSpiderResult]]:
+    target_data_css_selector = ".rank-box"
    table = targetPage.locator(target_data_css_selector)
    if table is None:
-        print(f'当前页面获取table为空:{scrawl_type}')
+        print(f"当前页面获取table为空:{scrawl_type}")
        return None
    # 获取当前这个doc_obj元素下面所有的tr标签
-    trs = table.locator('tr')
+    trs = table.locator("tr")
    if trs is None:
-        print(f'当前页面获取tr为空:{scrawl_type}')
+        print(f"当前页面获取tr为空:{scrawl_type}")
    # 循环遍历trs下面的每个tr标签下面的td标签
    at = await trs.all_text_contents()
-    result = (Stream(at)
+    result = (
-              .filter(lambda x: x.strip() != '')
+        Stream(at)
-              .filter(lambda x: len(x.split(' ')) > 6)
+        .filter(lambda x: x.strip() != "")
-              .map(lambda x: GuoDuoSpiderResult(time=target_time,
+        .filter(lambda x: len(x.split(" ")) > 6)
+        .map(
+            lambda x: GuoDuoSpiderResult(
+                time=target_time,
                targetType=TypeEnum(scrawl_type),
                createTime=datetime.now(),
-                                                platform='all',
+                platform="all",
-                                                score=float(x.split(' ')[-1]),
+                score=float(x.split(" ")[-1]),
-                                                sourceName=x.split(' ')[5]))
+                sourceName=x.split(" ")[5],
-              .to_list())
+            )
+        )
+        .to_list()
+    )
    return result
-if __name__ == '__main__':
+if __name__ == "__main__":
    # 获取程序开始执行的时间
    start_time = time.time()
-    param = SpiderParams(startDate='2024-01-02',
+    param = SpiderParams(
-                         endDate='2024-01-15',
+        startDate="2024-01-02",
-                         target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE]
+        endDate="2024-01-15",
-                         , url='http://guduodata.com/')
+        target_type=[
+            TypeEnum.ANIME,
+            TypeEnum.EPISODES,
+            TypeEnum.MOVIE,
+            TypeEnum.VARIETY,
+        ],
+        url="http://guduodata.com/",
+    )
    res = asyncio.run(startBrowser(param))
-    print(f'程序执行耗时时间：{(time.time() - start_time)/1000}长度为：{len(res)}')
+    print(f"代码重试情况：{startBrowser.statistics}")
-    for it in res:
+    # time.time() differences are already in seconds; the former "/1000" under-reported the duration.
+    print(f"程序执行耗时时间：{time.time() - start_time}长度为：{len(res)}")
-        print(it)
--- a/requirements.txt 0 → 100644
View file @4c03d0d
+++ b/requirements.txt 0 → 100644
View file @4c03d0d
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.7.0
+APScheduler==3.11.0
+asyncmy==0.2.10
+click==8.1.7
+colorama==0.4.6
+distlib==0.3.8
+exceptiongroup==1.2.2
+fastapi==0.115.6
+greenlet==3.1.1
+h11==0.14.0
+idna==3.10
+iso8601==2.1.0
+pandoc==2.3
+playwright==1.49.1
+plumbum==1.8.3
+ply==3.11
+pydantic==2.10.4
+pydantic_core==2.27.2
+pyee==12.0.0
+pypika-tortoise==0.3.2
+pytz==2024.2
+pywin32==306; sys_platform == "win32"
+PyYAML==6.0.2
+sniffio==1.3.1
+starlette==0.41.3
+SuperStream==0.2.6
+tenacity==9.0.0
+tortoise-orm==0.22.2
+tqdm==4.67.1
+typing_extensions==4.12.2
+tzdata==2024.2
+tzlocal==5.2
+uvicorn==0.34.0
+virtualenv==20.25.0