init
0 parents
Showing 52 changed files with 732 additions and 0 deletions
.idea/.gitignore
0 → 100644
.idea/FastAPIProject1.iml
0 → 100644
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Spider" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PackageRequirementsSettings">
    <option name="modifyBaseFiles" value="true" />
  </component>
</module>
\ No newline at end of file
.idea/dataSources.xml
0 → 100644
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="docker" uuid="4b6d465a-8a9f-4d96-bc3c-4d40505aae14">
      <driver-ref>mysql.8</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
      <jdbc-url>jdbc:mysql://localhost:3306</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
    </data-source>
  </component>
</project>
\ No newline at end of file
.idea/inspectionProfiles/Project_Default.xml
0 → 100644
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="HttpUrlsUsage" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredUrls">
        <list>
          <option value="http://0.0.0.0" />
          <option value="http://127.0.0.1" />
          <option value="http://activemq.apache.org/schema/" />
          <option value="http://cxf.apache.org/schemas/" />
          <option value="http://guduodata.com" />
          <option value="http://java.sun.com/" />
          <option value="http://javafx.com/fxml" />
          <option value="http://javafx.com/javafx/" />
          <option value="http://json-schema.org/draft" />
          <option value="http://localhost" />
          <option value="http://maven.apache.org/POM/" />
          <option value="http://maven.apache.org/xsd/" />
          <option value="http://primefaces.org/ui" />
          <option value="http://schema.cloudfoundry.org/spring/" />
          <option value="http://schemas.xmlsoap.org/" />
          <option value="http://tiles.apache.org/" />
          <option value="http://www.ibm.com/webservices/xsd" />
          <option value="http://www.jboss.com/xml/ns/" />
          <option value="http://www.jboss.org/j2ee/schema/" />
          <option value="http://www.springframework.org/schema/" />
          <option value="http://www.springframework.org/security/tags" />
          <option value="http://www.springframework.org/tags" />
          <option value="http://www.thymeleaf.org" />
          <option value="http://www.w3.org/" />
          <option value="http://xmlns.jcp.org/" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyArgumentListInspection" enabled="true" level="INFORMATION" enabled_by_default="true" />
    <inspection_tool class="PyMethodMayBeStaticInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="E402" />
          <option value="W605" />
          <option value="E501" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N803" />
          <option value="N802" />
          <option value="N801" />
          <option value="N806" />
          <option value="N813" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredIdentifiers">
        <list>
          <option value="tensorflow" />
          <option value="str.get" />
          <option value="admin.Admin._Base__readUsers" />
          <option value="admin.Admin._Base__changeActive" />
          <option value="admin.Admin._Base__changeRole" />
          <option value="admin.Admin._Base__writeGifts" />
          <option value="admin.Admin._Base__reduceGift" />
          <option value="user.User._Base__readUsers" />
          <option value="user.User._Base__readGifts" />
          <option value="urllib.request" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PydanticInspection" enabled="false" level="WARNING" enabled_by_default="false" />
  </profile>
</component>
\ No newline at end of file
.idea/misc.xml
0 → 100644
.idea/modules.xml
0 → 100644
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/FastAPIProject1.iml" filepath="$PROJECT_DIR$/.idea/FastAPIProject1.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
.idea/vcs.xml
0 → 100644
app/__pycache__/main.cpython-310.pyc
0 → 100644
No preview for this file type
app/application.yaml
0 → 100644
server:
  port: 7654
tortoise:
  connections:
    default:
      engine: tortoise.backends.mysql
      credentials:
        database: fast_api
        host: 127.0.0.1
        password: root
        port: 3306
        user: root
        minsize: 10
        maxsize: 200
        connect_timeout: 30
        echo: true
  apps:
    models_read:
      models:
        - app.model.mysql_model
      default_connection: default
  log_queries: true # enable query logging
app/config/__init__.py
0 → 100644
File mode changed
app/config/app_config.py
0 → 100644
import os
import yaml

from app.schemas.config_schema import AppConfig

import logging

logger = logging.getLogger(__name__)


def getAppConfig():
    logger.info('Loading AppConfig')
    # Absolute path of the current file
    current_file_path = os.path.abspath(__file__)
    # Parent directory of the current file
    parent_directory = os.path.dirname(current_file_path)
    # Build the path to application.yaml
    application_yaml_path = os.path.join(parent_directory, '..', 'application.yaml')
    with open(application_yaml_path, 'r') as f:
        config = yaml.safe_load(f)
    return AppConfig(**config)
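A minimal usage sketch (an illustration, not part of the commit), assuming the process is started from the project root so that `app` is importable:

from app.config.app_config import getAppConfig

conf = getAppConfig()    # parses app/application.yaml into an AppConfig
print(conf.server.port)  # 7654, the port main.py later hands to uvicorn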
app/config/log_config.py
0 → 100644
import functools
import logging.config
import os
from typing import Any

import yaml


@functools.lru_cache()
def getLogConfig() -> dict[str, Any]:
    current_file_path = os.path.abspath(__file__)
    # Parent directory of the current file
    parent_directory = os.path.dirname(current_file_path)
    # Build the path to log-config.yaml
    log_config_path = os.path.join(parent_directory, '..', 'log-config.yaml')
    # Load the logging configuration
    with open(log_config_path, 'r') as f:
        log_config = yaml.safe_load(f)
    return log_config


# Configure the root logger at import time so other modules can use it
logging.config.dictConfig(getLogConfig())
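Because `dictConfig` runs at module import time, importing this module once is enough to configure logging application-wide, and `lru_cache` means later calls (such as the one in `main.py`) reuse the parsed dict. A sketch of what a consumer looks like:

import app.config.log_config  # noqa: F401 - imported for its dictConfig side effect
import logging

logger = logging.getLogger(__name__)
logger.info("goes to both the console and file handlers from log-config.yaml")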
app/config/tortoise_config.py
0 → 100644
import os
import yaml
from app.schemas.config_schema import TortoiseConfig
import logging

logger = logging.getLogger(__name__)


def getTortoiseConfig():
    logger.info('Loading TortoiseConfig')
    # Absolute path of the current file
    current_file_path = os.path.abspath(__file__)
    # Parent directory of the current file
    parent_directory = os.path.dirname(current_file_path)
    # Build the path to application.yaml
    application_yaml_path = os.path.join(parent_directory, '..', 'application.yaml')
    with open(application_yaml_path, 'r') as f:
        config = yaml.safe_load(f)
    return TortoiseConfig(**(config.get('tortoise')))
app/event/__init__.py
0 → 100644
File mode changed
app/event/fastapi_event.py
0 → 100644
from contextlib import asynccontextmanager

from tortoise import Tortoise
from app.job.job import scheduler
from fastapi import FastAPI
from app.config.tortoise_config import getTortoiseConfig
import logging

logger = logging.getLogger(__name__)


async def init():
    # Initialize the database connections
    logger.info("Initializing the database")
    # model_dump() is the pydantic v2 replacement for the deprecated .dict()
    await Tortoise.init(config=getTortoiseConfig().model_dump())
    # Generate the database tables
    logger.info("Generating database schemas")
    await Tortoise.generate_schemas()


async def close():
    await Tortoise.close_connections()


# Define an async context manager with the asynccontextmanager decorator,
# used as the FastAPI lifespan handler
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start the APScheduler instance
    scheduler.start()
    logger.info("APScheduler started")
    await init()
    logger.info("Database initialized")
    yield
    # On exiting the context (application shutdown), release resources
    scheduler.shutdown()
    logger.info("APScheduler shut down")
    await close()
    logger.info("Database connections closed")
app/job/__init__.py
0 → 100644
File mode changed
app/job/__pycache__/__init__.cpython-310.pyc
0 → 100644
No preview for this file type
app/job/__pycache__/job.cpython-310.pyc
0 → 100644
No preview for this file type
app/job/job.py
0 → 100644
from datetime import datetime

from apscheduler.schedulers.asyncio import AsyncIOScheduler

# Create a scheduler instance; it is started from the FastAPI lifespan handler
scheduler = AsyncIOScheduler()


# A scheduled job that runs every minute
@scheduler.scheduled_job('interval', minutes=1)
async def cron_job():
    # Job body, e.g. print the current time
    print(f"The current time is {datetime.now()}")
app/log-config.yaml
0 → 100644
version: 1
disable_existing_loggers: false

formatters:
  standard:
    format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

handlers:
  console:
    class: logging.StreamHandler
    level: INFO
    formatter: standard
    stream: ext://sys.stderr
  file:
    class: logging.handlers.TimedRotatingFileHandler
    level: INFO
    formatter: standard
    filename: 'logs/app.log' # path and name of the log file
    when: midnight # rotate at midnight
    interval: 1 # rotate once per day
    backupCount: 7 # keep the last 7 days of log files
    encoding: utf8 # file encoding

loggers:
  '':
    handlers: ['console', 'file'] # two handlers: console and file
    level: INFO
    propagate: true
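One caveat: `TimedRotatingFileHandler` raises `FileNotFoundError` if the `logs/` directory does not exist when `dictConfig` runs. A minimal guard (hypothetical, not in the commit) that could run before the `dictConfig` call in `log_config.py`:

import os

os.makedirs('logs', exist_ok=True)  # ensure the log directory exists before dictConfig runs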
app/logs/app.log
0 → 100644
This diff could not be displayed because it is too large.
app/main.py
0 → 100644
import logging

import uvicorn
from fastapi import FastAPI
from app.router import spider_router
from app.config.app_config import getAppConfig
from app.event.fastapi_event import lifespan
from app.config.log_config import getLogConfig

app = FastAPI(lifespan=lifespan)
app.include_router(spider_router.spiderRouter)

if __name__ == '__main__':
    appConf = getAppConfig()
    logging.info(f"Starting server -> {appConf.server}")
    # uvicorn.run blocks until shutdown; "app.main:app" assumes the process
    # is started from the project root (e.g. python -m app.main)
    uvicorn.run("app.main:app", host="0.0.0.0",
                port=appConf.server.port,
                log_config=getLogConfig())
app/model/__init__.py
0 → 100644
File mode changed
app/model/mysql_model.py
0 → 100644
from datetime import datetime

from tortoise import Model, fields

from app.schemas.spider_schema import TypeEnum


class SpiderModel(Model):
    id = fields.IntField(pk=True)
    # Note: 'regex' is not a documented CharField option in Tortoise; it is kept
    # here as documentation of the expected format and is not enforced
    time = fields.CharField(max_length=10, description="data date, a yyyy-MM-dd string",
                            regex=r'^\d{4}-\d{2}-\d{2}$')
    targetType = fields.IntEnumField(TypeEnum, description="data type", source_field='target_type')
    platform = fields.CharField(max_length=255, description="platform name")
    sourceName = fields.CharField(max_length=255, description="media resource name", source_field='source_name')
    score = fields.FloatField(description="popularity score")
    createTime = fields.DatetimeField(default=datetime.now, description="creation time", source_field='create_time')

    class Meta:
        table = "spider_data"
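The `source_field` mappings keep the Python attributes camelCase while the MySQL columns stay snake_case. A query sketch (illustrative only, assuming `Tortoise.init` has already run as in `fastapi_event.init`):

# Filter by the Python attribute names; Tortoise translates them to the
# snake_case columns declared via source_field
rows = await SpiderModel.filter(time='2024-01-02', targetType=TypeEnum.MOVIE)
top = await SpiderModel.filter(time='2024-01-02').order_by('-score').first()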
app/router/__init__.py
0 → 100644
File mode changed
app/router/spider_router.py
0 → 100644
# router/spider_router.py
import logging
from typing import Dict, Any

from fastapi import APIRouter, Path

from app.schemas.spider_schema import SpiderParams, TaskInfo
from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
from app.spider import guduo_spider

logger = logging.getLogger(__name__)
spiderRouter = APIRouter(prefix="/spider", tags=["spider"])


@spiderRouter.post("/start/", summary="Run a spider in the background")
async def start_spider(param: SpiderParams):
    logger.info(f"Starting spider with params: {param}")
    res = await guduo_spider.startBrowser(param)
    logger.info("Crawl finished")
    return res


@spiderRouter.post("/add_guduo_job", summary="Add a scheduled Guduo spider job")
async def add_task_to_job(param: Dict[str, Any]):
    jobParam = SpiderParams(**(param.get('taskParam')))
    taskInfo = TaskInfo(**(param.get('taskInfo')))
    add_guduo_job(jobParam, taskInfo)
    return 'Guduo spider job added successfully'


@spiderRouter.get("/add_job/{taskId}", summary="Add a scheduled test job")
async def add_test_job(taskId: int = Path(..., description="Unique identifier of the task")):
    async def test(name: str):
        logger.info(f"Test job executed -> {name}")

    taskInfo = TaskInfo(taskId=taskId, taskStart=True,
                        executeStrategy='0 0/1 * * * ?',
                        url='www.baidu.com')
    add_job(['test'], taskInfo, test)


@spiderRouter.get("/get_job/{taskId}", summary="Get a scheduled job")
async def get_job(taskId: int = Path(...)):
    return get_job_info(taskId)
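`/spider/add_guduo_job` accepts a raw `Dict[str, Any]` and validates the two halves itself, so the request body must follow the `SpiderParams` and `TaskInfo` schemas. A sample payload (the values are illustrative):

payload = {
    "taskParam": {
        "startDate": "2024-01-02",
        "endDate": "2024-01-05",
        "target_type": [1, 3],  # TypeEnum values: 1 EPISODES, 3 ANIME
        "url": "http://guduodata.com/",
    },
    "taskInfo": {
        "taskId": 1,
        "taskStart": True,
        "executeStrategy": "0 0/1 * * * ?",  # Quartz-style cron parsed by add_job
        "url": "http://guduodata.com/",
    },
}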
app/schemas/__init__.py
0 → 100644
File mode changed
app/schemas/config_schema.py
0 → 100644
import ssl
from typing import Dict, List, Optional, Any

from pydantic import BaseModel, Field


def get_ctx():
    ctx = ssl.create_default_context()
    # And in this example we disable validation...
    # Please don't do this. Look at the official Python ``ssl`` module documentation
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


class Credentials(BaseModel):
    database: str
    host: str
    password: str
    port: int
    user: str
    ssl: Any = Field(default_factory=get_ctx)
    minsize: Optional[int] = None
    maxsize: Optional[int] = None
    connect_timeout: Optional[int] = None
    echo: Optional[bool] = None


class Connection(BaseModel):
    engine: str
    credentials: Credentials


class App(BaseModel):
    models: List[str]
    default_connection: str


class TortoiseConfig(BaseModel):
    connections: Dict[str, Connection]
    apps: Dict[str, App]
    log_queries: bool


class Port(BaseModel):
    port: int


class AppConfig(BaseModel):
    server: Port
app/schemas/spider_schema.py
0 → 100644
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List

from apscheduler.job import Job
from pydantic import BaseModel, Field


class TypeEnum(IntEnum):
    EPISODES = 1  # TV series
    MOVIE = 2  # movies
    ANIME = 3  # animation

    def get_precise_positioning(self, other) -> List[str]:
        # CSS selectors that navigate to the ranking page for each content type
        if self.ANIME == other:
            return ['.cateIcon_5',
                    'li.anime-platform:nth-child(1)']
        if self.MOVIE == other:
            return ['.cateIcon_3',
                    'ul.type-box:nth-child(1) > li:nth-child(1)']
        if self.EPISODES == other:
            return ['.cateIcon_1',
                    'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)',
                    'ul.type-box:nth-child(1) > li:nth-child(1)']


class GuoDuoSpiderResult(BaseModel):
    # data date, a yyyy-MM-dd string
    time: str = Field(..., description="data date, a yyyy-MM-dd string", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # type: 1 TV series, 2 movies, 3 animation
    targetType: TypeEnum = Field(..., description="data type")
    # platform name
    platform: str
    # media resource name
    sourceName: str
    # popularity score
    score: float
    # creation time
    createTime: datetime

    class Config:
        from_attributes = False


class SpiderParams(BaseModel):
    startDate: str = Field(..., description="data date, a yyyy-MM-dd string", pattern=r'^\d{4}-\d{2}-\d{2}$')
    endDate: str = Field(..., description="data date, a yyyy-MM-dd string", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # type: 1 TV series, 2 movies, 3 animation
    target_type: List[TypeEnum]
    # URL to crawl
    url: str

    class Config:
        from_attributes = False

    def get_time_range(self) -> List[str]:
        # Expand the inclusive [startDate, endDate] window into day strings
        start_date_obj = datetime.strptime(self.startDate, '%Y-%m-%d')
        end_date_obj = datetime.strptime(self.endDate, '%Y-%m-%d')
        delta = end_date_obj - start_date_obj
        time_range = []
        for i in range(delta.days + 1):
            day = start_date_obj + timedelta(days=i)
            time_range.append(day.strftime('%Y-%m-%d'))
        return time_range


class ApschedulerJob(BaseModel):
    jobId: Job
    taskId: int

    class Config:
        arbitrary_types_allowed = True


class TaskInfo(BaseModel):
    taskStart: bool
    executeStrategy: str
    taskId: int
    url: str
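`get_time_range` yields one string per day, both endpoints included, which `startBrowser` later uses to open one browser page per date. For example:

params = SpiderParams(startDate='2024-01-02', endDate='2024-01-05',
                      target_type=[TypeEnum.MOVIE], url='http://guduodata.com/')
print(params.get_time_range())
# ['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']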
app/service/__init__.py
0 → 100644
File mode changed
app/service/spider_job_service.py
0 → 100644
import asyncio
import logging
from typing import List, Any
from app.model.mysql_model import SpiderModel
from superstream import Stream

from app.job.job import scheduler
from app.schemas.spider_schema import ApschedulerJob, TaskInfo, SpiderParams, GuoDuoSpiderResult
from app.spider.guduo_spider import startBrowser

logger = logging.getLogger(__name__)
jobs: List[ApschedulerJob] = []


def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
    # Parse a six-field Quartz-style cron expression:
    # second minute hour day month weekday; '?' maps to None
    cron_fields = taskInfo.executeStrategy.split(" ")
    minute = None if cron_fields[1] == "?" else cron_fields[1]
    hour = None if cron_fields[2] == "?" else cron_fields[2]
    day = None if cron_fields[3] == "?" else cron_fields[3]
    month = None if cron_fields[4] == "?" else cron_fields[4]
    # Quartz's weekday field corresponds to APScheduler's day_of_week
    # (note the two schedulers use different weekday numbering)
    day_of_week = None if cron_fields[5] == "?" else cron_fields[5]
    job = scheduler.add_job(task, "cron",
                            minute=minute, hour=hour, day=day, month=month,
                            day_of_week=day_of_week,
                            args=params)
    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
    logger.info(
        f"Job added, task id: {taskInfo.taskId}, schedule: {taskInfo.executeStrategy}")


def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo):
    """
    Add a Guduo spider task to the scheduler
    :param taskParam: Guduo spider parameters
    :param taskInfo: task information
    :return:
    """
    add_job([taskParam], taskInfo, scrawl_and_save)


def delete_job(taskId: int):
    # Iterate over a copy so removing an element does not skip the next one
    for job in list(jobs):
        if job.taskId == taskId:
            job.jobId.remove()
            jobs.remove(job)
            logger.info(f"Job removed, task id: {taskId}")


def get_job_info(taskId: int):
    job = (Stream(jobs)
           .filter(lambda x: x.taskId == taskId)
           .find_first())
    return f'Job info -> {job}'


async def scrawl_and_save(taskParam: SpiderParams):
    # Run the spider and collect the results
    results = await startBrowser(taskParam)
    asyncTasks = (save_or_update(item) for item in results)
    await asyncio.gather(*asyncTasks)
    logger.info(f'Spider job finished; {len(results)} rows saved to the database')


async def save_or_update(result: GuoDuoSpiderResult):
    # Upsert: update the row matching time/targetType/platform/sourceName
    # if it exists, otherwise insert a new one
    obj = await SpiderModel.get_or_none(
        time=result.time,
        targetType=result.targetType,
        platform=result.platform,
        sourceName=result.sourceName
    )
    if obj:
        for key, value in result.__dict__.items():
            setattr(obj, key, value)
        await obj.save()
    else:
        await SpiderModel.create(
            time=result.time,
            targetType=result.targetType,
            platform=result.platform,
            sourceName=result.sourceName,
            score=result.score,
            createTime=result.createTime
        )
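A worked example of the cron mapping (values illustrative): `'0 30 8 * * ?'` splits into `['0', '30', '8', '*', '*', '?']`, so the scheduler receives minute='30', hour='8', day='*', month='*', day_of_week=None, i.e. 08:30 every day; the seconds field (index 0) is ignored:

taskInfo = TaskInfo(taskId=2, taskStart=True,
                    executeStrategy='0 30 8 * * ?',  # 08:30 every day
                    url='http://guduodata.com/')
add_guduo_job(SpiderParams(startDate='2024-01-02', endDate='2024-01-02',
                           target_type=[TypeEnum.EPISODES],
                           url='http://guduodata.com/'),
              taskInfo)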
app/spider/__init__.py
0 → 100644
File mode changed
app/spider/guduo_spider.py
0 → 100644
import asyncio
import time
from datetime import datetime
from typing import List, Optional
from playwright.async_api import Page, async_playwright
from superstream import Stream
from app.schemas.spider_schema import GuoDuoSpiderResult, SpiderParams, TypeEnum
import logging

logger = logging.getLogger(__name__)


async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
    time_range = spiderParam.get_time_range()
    url = spiderParam.url
    scrawl_types = spiderParam.target_type
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        # Open one browser page per day in the range
        pages = await asyncio.gather(*(browser.new_page()
                                       for _ in range(len(time_range))))
        # Crawl all dates concurrently, pairing time_range[i] with pages[i]
        results = await asyncio.gather(*(hand_one_data(time_range[i], pages[i],
                                                       scrawl_types, url)
                                         for i in range(len(time_range))))
        return (Stream(results)
                .filter(lambda x: x is not None)
                .filter(lambda x: len(x) > 0)
                .flat_map(lambda x: Stream(x))
                .to_list())


async def hand_one_data(targetDate: str,
                        page: Page,
                        scrawl_types: List[TypeEnum],
                        url: str) -> List[GuoDuoSpiderResult]:
    # Split the date string into year, month and day
    year, month, day = targetDate.split('-')
    day = day.lstrip('0')  # the date picker uses non-zero-padded days
    month = int(month) - 1  # Pikaday months are zero-based
    await page.goto(url)
    # Wait for the overlay to disappear, then open the date picker
    await page.click('//*[@id="currentDateTxt"]')
    data_css_selector = (f'#day span.pika-button.pika-day[type="button"]'
                         f'[data-pika-year="{year}"]'
                         f'[data-pika-month="{month}"]'
                         f'[data-pika-day="{day}"]')
    doc = page.locator(data_css_selector)
    # Click the target date
    await doc.click()
    # Final navigation clicks for each content type
    css_selectors = (Stream(scrawl_types)
                     .map(lambda x: (x, x.get_precise_positioning(x)))
                     .group_by(lambda x: x[0]))
    results = []
    for key, value in css_selectors.items():
        logger.info(f'Crawling {targetDate}, type {value[0][0]}')
        for css_selector in value[0][1]:
            await page.click(css_selector)
        result = await get_data(page, key.value, targetDate)
        if result:
            Stream(result).for_each(lambda x: results.append(x))
    return results


async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Optional[List[GuoDuoSpiderResult]]:
    target_data_css_selector = '.rank-box'
    table = targetPage.locator(target_data_css_selector)
    if table is None:
        logger.warning(f'No ranking table found on the page for type {scrawl_type}')
        return None
    # All tr tags under the ranking table
    trs = table.locator('tr')
    if trs is None:
        logger.warning(f'No tr rows found on the page for type {scrawl_type}')
        return None
    # Parse each row's text content into a result object
    at = await trs.all_text_contents()
    result = (Stream(at)
              .filter(lambda x: x.strip() != '')
              .filter(lambda x: len(x.split(' ')) > 6)
              .map(lambda x: GuoDuoSpiderResult(time=target_time,
                                                targetType=TypeEnum(scrawl_type),
                                                createTime=datetime.now(),
                                                platform='all',
                                                score=float(x.split(' ')[-1]),
                                                sourceName=x.split(' ')[5]))
              .to_list())
    return result


if __name__ == '__main__':
    # Record the start time so the crawl can be timed
    start_time = time.time()
    param = SpiderParams(startDate='2024-01-02',
                         endDate='2024-01-05',
                         target_type=[TypeEnum.ANIME, TypeEnum.EPISODES, TypeEnum.MOVIE],
                         url='http://guduodata.com/')
    res = asyncio.run(startBrowser(param))
    print(f'Crawl took {time.time() - start_time}s, {len(res)} rows')
    for it in res:
        print(it)
requirements.txt
0 → 100644
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.7.0
APScheduler==3.11.0
asyncmy==0.2.10
click==8.1.7
exceptiongroup==1.2.2
fastapi==0.115.6
greenlet==3.1.1
h11==0.14.0
idna==3.10
iso8601==2.1.0
playwright==1.49.1
pydantic==2.10.4
pydantic_core==2.27.2
pyee==12.0.0
pypika-tortoise==0.3.2
pytz==2024.2
PyYAML==6.0.2
sniffio==1.3.1
starlette==0.41.3
SuperStream==0.2.6
tortoise-orm==0.22.2
typing_extensions==4.12.2
tzlocal==5.2
uvicorn==0.34.0