Commit 78f7d02c 78f7d02cef4999f3a981e1636e3627252679bcdd by wenxin

init

0 parents
Showing 52 changed files with 732 additions and 0 deletions
1 # 默认忽略的文件
2 /shelf/
3 /workspace.xml
4 # 基于编辑器的 HTTP 客户端请求
5 /httpRequests/
6 # Datasource local storage ignored files
7 /dataSources/
8 /dataSources.local.xml
1 <?xml version="1.0" encoding="UTF-8"?>
2 <module type="PYTHON_MODULE" version="4">
3 <component name="NewModuleRootManager">
4 <content url="file://$MODULE_DIR$" />
5 <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" />
6 <orderEntry type="sourceFolder" forTests="false" />
7 </component>
8 <component name="PackageRequirementsSettings">
9 <option name="modifyBaseFiles" value="true" />
10 </component>
11 </module>
...\ No newline at end of file ...\ No newline at end of file
1 <?xml version="1.0" encoding="UTF-8"?>
2 <project version="4">
3 <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
4 <data-source source="LOCAL" name="docker" uuid="4b6d465a-8a9f-4d96-bc3c-4d40505aae14">
5 <driver-ref>mysql.8</driver-ref>
6 <synchronize>true</synchronize>
7 <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
8 <jdbc-url>jdbc:mysql://localhost:3306</jdbc-url>
9 <working-dir>$ProjectFileDir$</working-dir>
10 </data-source>
11 </component>
12 </project>
...\ No newline at end of file ...\ No newline at end of file
1 <component name="InspectionProjectProfileManager">
2 <profile version="1.0">
3 <option name="myName" value="Project Default" />
4 <inspection_tool class="HttpUrlsUsage" enabled="true" level="WEAK WARNING" enabled_by_default="true">
5 <option name="ignoredUrls">
6 <list>
7 <option value="http://0.0.0.0" />
8 <option value="http://127.0.0.1" />
9 <option value="http://activemq.apache.org/schema/" />
10 <option value="http://cxf.apache.org/schemas/" />
11 <option value="http://guduodata.com" />
12 <option value="http://java.sun.com/" />
13 <option value="http://javafx.com/fxml" />
14 <option value="http://javafx.com/javafx/" />
15 <option value="http://json-schema.org/draft" />
16 <option value="http://localhost" />
17 <option value="http://maven.apache.org/POM/" />
18 <option value="http://maven.apache.org/xsd/" />
19 <option value="http://primefaces.org/ui" />
20 <option value="http://schema.cloudfoundry.org/spring/" />
21 <option value="http://schemas.xmlsoap.org/" />
22 <option value="http://tiles.apache.org/" />
23 <option value="http://www.ibm.com/webservices/xsd" />
24 <option value="http://www.jboss.com/xml/ns/" />
25 <option value="http://www.jboss.org/j2ee/schema/" />
26 <option value="http://www.springframework.org/schema/" />
27 <option value="http://www.springframework.org/security/tags" />
28 <option value="http://www.springframework.org/tags" />
29 <option value="http://www.thymeleaf.org" />
30 <option value="http://www.w3.org/" />
31 <option value="http://xmlns.jcp.org/" />
32 </list>
33 </option>
34 </inspection_tool>
35 <inspection_tool class="PyArgumentListInspection" enabled="true" level="INFORMATION" enabled_by_default="true" />
36 <inspection_tool class="PyMethodMayBeStaticInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
37 <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
38 <option name="ignoredErrors">
39 <list>
40 <option value="E402" />
41 <option value="W605" />
42 <option value="E501" />
43 </list>
44 </option>
45 </inspection_tool>
46 <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
47 <option name="ignoredErrors">
48 <list>
49 <option value="N803" />
50 <option value="N802" />
51 <option value="N801" />
52 <option value="N806" />
53 <option value="N813" />
54 </list>
55 </option>
56 </inspection_tool>
57 <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
58 <option name="ignoredIdentifiers">
59 <list>
60 <option value="tensorflow" />
61 <option value="str.get" />
62 <option value="admin.Admin._Base__readUsers" />
63 <option value="admin.Admin._Base__changeActive" />
64 <option value="admin.Admin._Base__changeRole" />
65 <option value="admin.Admin._Base__writeGifts" />
66 <option value="admin.Admin._Base__reduceGift" />
67 <option value="user.User._Base__readUsers" />
68 <option value="user.User._Base__readGifts" />
69 <option value="urllib.request" />
70 </list>
71 </option>
72 </inspection_tool>
73 <inspection_tool class="PydanticInspection" enabled="false" level="WARNING" enabled_by_default="false" />
74 </profile>
75 </component>
...\ No newline at end of file ...\ No newline at end of file
1 <component name="InspectionProjectProfileManager">
2 <settings>
3 <option name="USE_PROJECT_PROFILE" value="false" />
4 <version value="1.0" />
5 </settings>
6 </component>
...\ No newline at end of file ...\ No newline at end of file
1 <?xml version="1.0" encoding="UTF-8"?>
2 <project version="4">
3 <component name="Black">
4 <option name="sdkName" value="Spider" />
5 </component>
6 </project>
...\ No newline at end of file ...\ No newline at end of file
1 <?xml version="1.0" encoding="UTF-8"?>
2 <project version="4">
3 <component name="ProjectModuleManager">
4 <modules>
5 <module fileurl="file://$PROJECT_DIR$/.idea/FastAPIProject1.iml" filepath="$PROJECT_DIR$/.idea/FastAPIProject1.iml" />
6 </modules>
7 </component>
8 </project>
...\ No newline at end of file ...\ No newline at end of file
1 <?xml version="1.0" encoding="UTF-8"?>
2 <project version="4">
3 <component name="VcsDirectoryMappings" defaultProject="true" />
4 </project>
...\ No newline at end of file ...\ No newline at end of file
No preview for this file type
1 server:
2 port: 7654
3 tortoise:
4 connections:
5 default:
6 engine: tortoise.backends.mysql
7 credentials:
8 database: fast_api
9 host: 127.0.0.1
10 password: root
11 port: 3306
12 user: root
13 minsize: 10
14 maxsize: 200
15 connect_timeout: 30
16 echo: true
17 apps:
18 models_read:
19 models:
20 - app.model.mysql_model
21 default_connection: default
22 log_queries: true # 启用日志查询
File mode changed
1 import os
2 import yaml
3
4 from app.schemas.config_schema import AppConfig
5
6 import logging
7
8 logger = logging.getLogger(__name__)
9
10
def getAppConfig():
    """Load ``application.yaml`` (one level above this package) into AppConfig.

    Fix: open the file with an explicit UTF-8 encoding — the YAML contains
    Chinese comments, and the platform default codec (e.g. GBK on Chinese
    Windows) would fail to decode it.
    """
    logger.info('开始加载AppConfig')
    # Directory containing this module.
    parent_directory = os.path.dirname(os.path.abspath(__file__))
    # application.yaml lives one level above the config package.
    application_yaml_path = os.path.join(parent_directory, '..', 'application.yaml')
    with open(application_yaml_path, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)
    return AppConfig(**config_data)
1 import functools
2 import logging.config
3 import os
4 from typing import Any
5
6 import yaml
7
8
@functools.lru_cache()
def getLogConfig() -> dict[str, Any]:
    """Load and cache the logging dictConfig from ``log-config.yaml``.

    lru_cache ensures the YAML is parsed only once per process.
    Fix: read with explicit UTF-8 encoding — the file holds Chinese
    comments, which break under a non-UTF-8 platform default codec.
    """
    # Directory containing this module.
    parent_directory = os.path.dirname(os.path.abspath(__file__))
    # log-config.yaml sits one directory above this config package.
    log_config_path = os.path.join(parent_directory, '..', 'log-config.yaml')
    with open(log_config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
20
21
# Configure the root logger at import time so every module that calls
# logging.getLogger(__name__) inherits these handlers.
logging.config.dictConfig(getLogConfig())
1 import os
2 import yaml
3 from app.schemas.config_schema import TortoiseConfig
4 import logging
5
6 logger = logging.getLogger(__name__)
7
8
def getTortoiseConfig():
    """Load the ``tortoise`` section of application.yaml into TortoiseConfig.

    Fix: open with an explicit UTF-8 encoding — the YAML holds Chinese
    comments that the platform default codec may not decode.
    """
    logger.info('开始加载TortoiseConfig')
    # Directory containing this module.
    parent_directory = os.path.dirname(os.path.abspath(__file__))
    # application.yaml lives one level above the config package.
    application_yaml_path = os.path.join(parent_directory, '..', 'application.yaml')
    with open(application_yaml_path, 'r', encoding='utf-8') as f:
        raw = yaml.safe_load(f)
    return TortoiseConfig(**raw.get('tortoise'))
File mode changed
No preview for this file type
1 from contextlib import asynccontextmanager
2
3 from tortoise import Tortoise
4 from app.job.job import scheduler
5 from fastapi import FastAPI
6 from app.config.tortoise_config import getTortoiseConfig
7 import logging
8
9 logger = logging.getLogger(__name__)
10
11
async def init():
    """Initialise the Tortoise ORM connection pool and create the tables.

    Fix: pydantic v2 (requirements pin pydantic 2.10.4) deprecates
    ``BaseModel.dict()``; use ``model_dump()`` instead.
    """
    logger.info("开始初始化数据库")
    await Tortoise.init(config=getTortoiseConfig().model_dump())
    logger.info("开始生成数据库表")
    await Tortoise.generate_schemas()
19
20
async def close():
    # Close every Tortoise connection in the pool on application shutdown.
    await Tortoise.close_connections()
23
24
# Async context manager driving FastAPI's startup/shutdown lifecycle.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: start scheduler and DB on entry, tear down on exit.

    Fix: use the module-level ``logger`` instead of the root ``logging``
    module, consistent with the rest of this file.
    """
    scheduler.start()
    logger.info("apscheduler启动完成")
    await init()
    logger.info("初始化数据库完成")
    yield
    # Everything below the yield runs when the application shuts down.
    scheduler.shutdown()
    logger.info("apscheduler关闭完成")
    await close()
    logger.info("关闭数据库完成")
File mode changed
No preview for this file type
No preview for this file type
1 from datetime import datetime
2
3 from apscheduler.schedulers.asyncio import AsyncIOScheduler
4
5
6 # 导入asynccontextmanager用于创建异步上下文管理器
7
# Shared AsyncIOScheduler instance; started/stopped by the FastAPI lifespan.
scheduler = AsyncIOScheduler()


# Demo job that fires once per minute.
@scheduler.scheduled_job('interval', minutes=1)
async def cron_job():
    # Print the current time so the scheduler's heartbeat is visible.
    print(f"The current time is {datetime.now()}")
17
18
19
1 version: 1
2 disable_existing_loggers: false
3
4 formatters:
5 standard:
6 format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
7
8 handlers:
9 console:
10 class: logging.StreamHandler
11 level: INFO
12 formatter: standard
13 stream: ext://sys.stderr
14 file:
15 class: logging.handlers.TimedRotatingFileHandler
16 level: INFO
17 formatter: standard
18 filename: 'logs/app.log' # 日志文件的路径和名称
19 when: midnight # 按天分割
20 interval: 1 # 每天分割一次
21 backupCount: 7 # 保留最近7天的日志文件
22 encoding: utf8 # 文件编码
23
24 loggers:
25 '':
26 handlers: ['console', 'file'] # 使用两个处理器:控制台和文件
27 level: INFO
28 propagate: true
This diff could not be displayed because it is too large.
1 import logging
2
3 import uvicorn
4 from fastapi import FastAPI
5 from app.router import spider_router
6 from app.config.app_config import getAppConfig
7 from app.event.fastapi_event import lifespan
8 from app.config.log_config import getLogConfig
9
app = FastAPI(lifespan=lifespan)
app.include_router(spider_router.spiderRouter)

if __name__ == '__main__':
    appConf = getAppConfig()
    # Fix: uvicorn.run() blocks until the server exits, so the original
    # "启动成功" log only ever fired AFTER shutdown; log before starting.
    logging.info(f"启动成功->{appConf.server}")
    uvicorn.run("main:app", host="0.0.0.0",
                port=appConf.server.port,
                log_config=getLogConfig())
File mode changed
No preview for this file type
1 from datetime import datetime
2
3 from tortoise import Model, fields
4
5 from app.schemas.spider_schema import TypeEnum
6
7
class SpiderModel(Model):
    """Tortoise ORM model for one ranking row scraped from guduodata.com."""

    id = fields.IntField(pk=True)
    # NOTE(review): `regex=` is not a tortoise CharField argument and appears
    # to be silently ignored — presumably format validation happens in the
    # pydantic schema layer; confirm.
    time = fields.CharField(max_length=10, description="数据时间 yyyy-MM-dd 格式的字符", regex=r'^\d{4}-\d{2}-\d{2}$')
    targetType = fields.IntEnumField(TypeEnum, description="数据类型", source_field='target_type')
    platform = fields.CharField(max_length=255, description="平台名字")
    sourceName = fields.CharField(max_length=255, description="媒体资源名字", source_field='source_name')
    score = fields.FloatField(description="热度得分")
    # datetime.now is passed as a callable so it is evaluated per insert.
    createTime = fields.DatetimeField(default=datetime.now, description="创建时间", source_field='create_time')

    class Meta:
        # Physical table name.
        table = "spider_data"
File mode changed
1 # router/users.py
2 import logging
3 from typing import Dict, Any
4
5 from fastapi import APIRouter, Path
6
7 from app.schemas.spider_schema import SpiderParams, TaskInfo
8 from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
9 from app.spider import guduo_spider
10
11 logger = logging.getLogger(__name__)
12 spiderRouter = APIRouter(prefix="/spider", tags=["spider"])
13
14
@spiderRouter.post("/start/", summary="启动一个爬虫在后台运行")
async def read_users(param: SpiderParams):
    """Run the Guduo spider synchronously and return the crawled rows.

    NOTE(review): the name is left over from a router template — this
    endpoint starts a crawl, not a user read; renaming would change the
    public symbol, so it is kept.
    """
    # Fix: log message contained the typo "怕从" instead of "爬虫".
    logger.info(f"开始执行爬虫参数是:{param}")
    res = await guduo_spider.startBrowser(param)
    logger.info("爬取数据完成")
    return res
21
22
@spiderRouter.post("/add_guduo_job", summary="添加一个骨朵爬虫定时任务")
async def add_task_to_job(param: Dict[str, Any]):
    """Build the spider params and task info from the raw payload and schedule a crawl."""
    spider_params = SpiderParams(**param.get('taskParam'))
    task_info = TaskInfo(**param.get('taskInfo'))
    add_guduo_job(spider_params, task_info)
    return '添加骨朵爬虫任务成功'
29
30
@spiderRouter.get("/add_job/{taskId}", summary="添加一个定时任务")
async def add_test_job(taskId: int = Path(..., description="要查询的任务的唯一标识符")):
    """Register a throwaway logging job — a smoke test for the scheduler."""

    async def test(name: str):
        logger.info(f"测试定时任务执行->{name}")

    info = TaskInfo(
        taskId=taskId,
        taskStart=True,
        executeStrategy='0 0/1 * * * ?',
        url='www.baidu.com',
    )
    add_job(['test'], info, test)
41
@spiderRouter.get("/get_job/{taskId}", summary="获取一个定时任务")
async def get_job(taskId: int = Path(...)):
    # Delegate to the service layer; returns a human-readable job summary.
    return get_job_info(taskId)
File mode changed
1 import ssl
2 from typing import Dict, List, Optional, Any
3
4 from pydantic import BaseModel, Field
5
6
def get_ctx():
    """Build an SSL context with certificate validation turned OFF.

    WARNING: disabling hostname checking and certificate verification
    exposes the connection to man-in-the-middle attacks; this mirrors
    the original behaviour and should only be used against a trusted
    local MySQL instance.
    """
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context
14
15
class Credentials(BaseModel):
    """MySQL connection credentials consumed by tortoise.backends.mysql."""

    database: str
    host: str
    password: str
    port: int
    user: str
    # Each instance gets its own (verification-disabled) SSL context.
    ssl: Any = Field(default_factory=get_ctx)
    # Pool tuning knobs; None lets the driver defaults apply.
    minsize: Optional[int] = None
    maxsize: Optional[int] = None
    connect_timeout: Optional[int] = None
    echo: Optional[bool] = None
27
28
class Connection(BaseModel):
    """One named DB connection: engine import path plus its credentials."""
    engine: str
    credentials: Credentials


class App(BaseModel):
    """One tortoise "app": its model modules and the connection they use."""
    models: List[str]
    default_connection: str


class TortoiseConfig(BaseModel):
    """Mirror of the `tortoise` section of application.yaml."""
    connections: Dict[str, Connection]
    apps: Dict[str, App]
    log_queries: bool


class Port(BaseModel):
    """Server settings (currently just the listen port)."""
    port: int


class AppConfig(BaseModel):
    """Top-level application configuration (the whole application.yaml)."""
    server: Port
1 from datetime import timedelta, datetime
2 from enum import IntEnum
3 from typing import List
4
5 from apscheduler.job import Job
6 from pydantic import BaseModel, Field
7
8
class TypeEnum(IntEnum):
    """Media category codes used by the Guduo spider."""
    EPISODES = 1  # TV series
    MOVIE = 2     # film
    ANIME = 3     # animation

    def get_precise_positioning(self, other) -> List[str]:
        """Return the CSS selector chain that drills down to *other*'s tab.

        Mirrors the original if-chain: unknown values yield None.
        """
        selector_map = {
            TypeEnum.ANIME: ['.cateIcon_5',
                             'li.anime-platform:nth-child(1)'],
            TypeEnum.MOVIE: ['.cateIcon_3',
                             'ul.type-box:nth-child(1) > li:nth-child(1)'],
            TypeEnum.EPISODES: ['.cateIcon_1',
                                'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)',
                                'ul.type-box:nth-child(1) > li:nth-child(1)'],
        }
        return selector_map.get(other)
25
26
class GuoDuoSpiderResult(BaseModel):
    """One ranking entry scraped from guduodata.com."""

    # Data date as a yyyy-MM-dd string.
    # Fix: raw string for the regex — '\d' in a plain literal is an invalid
    # escape sequence (SyntaxWarning on modern CPython, flagged as W605).
    time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # 1 = episodes, 2 = movie, 3 = anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name ('all' when aggregated).
    platform: str
    # Media title.
    sourceName: str
    # Popularity score.
    score: float
    # Row creation timestamp.
    createTime: datetime

    class Config:
        from_attributes = False
43
44
class SpiderParams(BaseModel):
    """Input parameters for one Guduo crawl."""

    # Inclusive date range, yyyy-MM-dd.
    # Fix: raw strings for the regexes — '\d' in a plain literal is an
    # invalid escape sequence (SyntaxWarning on modern CPython, W605).
    startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # Categories to crawl: 1 episodes, 2 movie, 3 anime.
    target_type: List[TypeEnum]
    # Page to crawl.
    url: str

    class Config:
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """Return every yyyy-MM-dd date from startDate to endDate inclusive.

        An end date before the start date yields an empty list, matching
        the original loop's behaviour.
        """
        start = datetime.strptime(self.startDate, '%Y-%m-%d')
        end = datetime.strptime(self.endDate, '%Y-%m-%d')
        days = (end - start).days
        return [(start + timedelta(days=i)).strftime('%Y-%m-%d')
                for i in range(days + 1)]
67
68
class ApschedulerJob(BaseModel):
    """Pairs a live APScheduler Job handle with our own task id."""
    # The scheduler's Job object (used to cancel via .remove()).
    jobId: Job
    taskId: int

    class Config:
        # Job is not a pydantic-compatible type; accept it as-is.
        arbitrary_types_allowed = True


class TaskInfo(BaseModel):
    """Scheduling metadata for one task."""
    # NOTE(review): taskStart is not read anywhere in this commit — confirm
    # its intended use.
    taskStart: bool
    # Quartz-style cron string: second minute hour day month weekday.
    executeStrategy: str
    taskId: int
    url: str
File mode changed
1 import asyncio
2 import logging
3 from typing import List, Any
4 from app.model.mysql_model import SpiderModel
5 from superstream import Stream
6
7 from app.job.job import scheduler
8 from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult
9 from app.spider.guduo_spider import startBrowser
10
11 logger = logging.getLogger(__name__)
12 jobs: List[ApschedulerJob] = []
13
14
def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
    """Register *task* with the shared scheduler using a Quartz-style cron.

    The cron string is space-separated as
    ``second minute hour day month weekday``; the seconds field is ignored
    and ``?`` placeholders are mapped to None (APScheduler's "any" value).
    """
    fields = taskInfo.executeStrategy.split(" ")

    def normalize(field: str):
        # Quartz uses '?' for "no specific value"; APScheduler wants None.
        return None if field == "?" else field

    job = scheduler.add_job(
        task,
        "cron",
        minute=normalize(fields[1]),
        hour=normalize(fields[2]),
        day=normalize(fields[3]),
        month=normalize(fields[4]),
        week=normalize(fields[5]),
        args=params,
    )
    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
    logger.info(
        f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}")
34
35
def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
    """
    Register a Guduo spider crawl as a scheduled job.
    :param taskParam: spider parameters (date range, target types, URL)
    :param taskInfo: scheduling info (cron strategy, task id)
    :return: None
    """
    add_job([taskParam], taskInfo, scrawl_and_save)
44
45
def delete_job(taskId: int):
    """Remove every scheduled job registered under *taskId*.

    Fix: the original removed items from ``jobs`` while iterating the same
    list, which skips the element following each removal; iterate over a
    snapshot instead.
    """
    for job in list(jobs):
        if job.taskId == taskId:
            # Cancel the job in the scheduler, then drop our bookkeeping entry.
            job.jobId.remove()
            jobs.remove(job)
            logger.info(f"删除任务成功,任务id:{taskId}")
52
53
def get_job_info(taskId: int):
    # Find the first registered job matching taskId.  superstream's
    # find_first returns a wrapper object whose repr lands in the message,
    # so the Stream pipeline is part of the observable output.
    job = (Stream(jobs)
           .filter(lambda x: x.taskId == taskId)
           .find_first())
    return f'job 信息->{job}'
59
60
async def scrawl_and_save(taskParam: SpiderParams):
    """Run one crawl with *taskParam*, then upsert every result row."""
    results = await startBrowser(taskParam)
    # Persist all rows concurrently.
    pending = [save_or_update(row) for row in results]
    await asyncio.gather(*pending)
    logger.info(f'爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成')
67
68
async def save_or_update(result: GuoDuoSpiderResult):
    """Upsert one crawl result into ``spider_data``.

    Looks up an existing row by the natural key
    (time, targetType, platform, sourceName); updates it when found,
    otherwise inserts a new row.

    Fix: the original called ``obj.save()`` even right after
    ``SpiderModel.create`` — ``create`` already persists the row, so every
    insert issued a redundant second write.
    """
    obj = await SpiderModel.get_or_none(
        time=result.time,
        targetType=result.targetType,
        platform=result.platform,
        sourceName=result.sourceName
    )
    if obj is None:
        # create() persists the row itself; nothing more to do.
        await SpiderModel.create(
            time=result.time,
            targetType=result.targetType,
            platform=result.platform,
            sourceName=result.sourceName,
            score=result.score,
            createTime=result.createTime
        )
        return
    # Copy every field from the crawl result onto the existing row.
    for key, value in result.__dict__.items():
        setattr(obj, key, value)
    await obj.save()
File mode changed
1 import asyncio
2 import time
3 from datetime import datetime
4 from typing import List, Optional
5 from playwright.async_api import Page, async_playwright
6 from superstream import Stream
7 from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
8 import logging
9
10 logger = logging.getLogger(__name__)
11
12
async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
    """Crawl guduodata.com for every date in the parameter's range.

    Opens one browser page per date and scrapes all dates concurrently,
    then flattens the per-date result lists into a single list.
    NOTE(review): headless=False opens a visible Firefox window — fine for
    local debugging, but confirm before running this on a server.
    """
    time_range = spiderParam.get_time_range()
    url = spiderParam.url
    scrawl_types = spiderParam.target_type
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        # One page per date so all dates can be crawled concurrently.
        pages = await asyncio.gather(*(browser.new_page()
                                       for _ in range(len(time_range))))
        # Crawl each (date, page) pair in parallel.
        results = await asyncio.gather(*(hand_one_data(time_range[i], pages[i],
                                                       scrawl_types, url)
                                         for i in range(len(time_range))))
        # Flatten the per-date lists, dropping None and empty entries.
        return (Stream(results)
                .filter(lambda x: x is not None)
                .filter(lambda x: len(x) > 0)
                .flat_map(lambda x: Stream(x))
                .to_list())
31
32
async def hand_one_data(targetDate: str,
                        page: Page,
                        scrawl_types: List[TypeEnum],
                        url: str) -> List[GuoDuoSpiderResult]:
    """Crawl every requested category for one date on one browser page.

    Drives the pika date picker to *targetDate*, then clicks through each
    category's selector chain and collects the ranking rows.
    """
    year, month, day = targetDate.split('-')
    # Fix: the original used day.replace('0', ''), which strips EVERY zero
    # ('10' -> '1', '20' -> '2', '30' -> '3') and clicked the wrong calendar
    # day.  int() drops only the leading zero ('05' -> '5', '10' -> '10').
    day = str(int(day))
    # pika months are zero-based.
    month = int(month) - 1
    await page.goto(url)
    # Open the date picker.
    await page.click('//*[@id="currentDateTxt"]')
    data_css_selector = (f'#day span.pika-button.pika-day[type="button"]'
                         f'[data-pika-year="{year}"]'
                         f'[data-pika-month="{month}"]'
                         f'[data-pika-day="{day}"]')
    doc = page.locator(data_css_selector)
    # Click the target day.
    await doc.click()
    # Map each category to its drill-down selector chain.
    css_selectors = (Stream(scrawl_types)
                     .map(lambda x: (x, x.get_precise_positioning(x)))
                     .group_by(lambda x: x[0]))
    results = []
    for key, value in css_selectors.items():
        logger.info(f'开始爬取 {targetDate} 类型是{value[0][0]} 的数据')
        # Click through the selector chain to reach the category's table.
        for css_selector in value[0][1]:
            await page.click(css_selector)
        result = await get_data(page, key.value, targetDate)
        if result:
            results.extend(result)
    return results
64
65
async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]:
    """Scrape the currently visible ranking table into result objects.

    Rows are parsed by splitting the row text on spaces: token 5 is the
    title and the last token is the score.  NOTE(review): this breaks for
    titles containing spaces — confirm against the live page markup.
    """
    table = targetPage.locator('.rank-box')
    # locator() never returns None; kept as a cheap sanity guard.
    if table is None:
        print(f'当前页面获取table为空:{scrawl_type}')
        return None
    trs = table.locator('tr')
    if trs is None:
        print(f'当前页面获取tr为空:{scrawl_type}')
        # Fix: the original fell through here and would have crashed on
        # trs.all_text_contents(); bail out instead.
        return None
    at = await trs.all_text_contents()
    result = (Stream(at)
              .filter(lambda x: x.strip() != '')
              .filter(lambda x: len(x.split(' ')) > 6)
              .map(lambda x: GuoDuoSpiderResult(time=target_time,
                                                targetType=TypeEnum(scrawl_type),
                                                createTime=datetime.now(),
                                                platform='all',
                                                score=float(x.split(' ')[-1]),
                                                sourceName=x.split(' ')[5]))
              .to_list())
    return result
89
90
if __name__ == '__main__':
    # Manual smoke test: crawl 4 days of all three categories and time it.
    start_time = time.time()
    param = SpiderParams(startDate='2024-01-02',
                         endDate='2024-01-05',
                         target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE]
                         , url='http://guduodata.com/')
    res = asyncio.run(startBrowser(param))
    print(f'程序执行耗时时间:{time.time() - start_time} 长度为:{len(res)}')
    for it in res:
        print(it)
1 aiosqlite==0.20.0
2 annotated-types==0.7.0
3 anyio==4.7.0
4 APScheduler==3.11.0
5 asyncmy==0.2.10
6 click==8.1.7
7 exceptiongroup==1.2.2
8 fastapi==0.115.6
9 greenlet==3.1.1
10 h11==0.14.0
11 idna==3.10
12 iso8601==2.1.0
13 playwright==1.49.1
14 pydantic==2.10.4
15 pydantic_core==2.27.2
16 pyee==12.0.0
17 pypika-tortoise==0.3.2
18 pytz==2024.2
19 PyYAML==6.0.2
20 sniffio==1.3.1
21 starlette==0.41.3
22 SuperStream==0.2.6
23 tortoise-orm==0.22.2
24 typing_extensions==4.12.2
25 tzlocal==5.2
26 uvicorn==0.34.0