Add fetching of scheduled task info from the database
Showing 8 changed files with 164 additions and 21 deletions
Database and spider configuration (YAML):

@@ -5,9 +5,9 @@ tortoise:
   default:
     engine: tortoise.backends.mysql
     credentials:
-      database: fast_api
-      host: 127.0.0.1
-      password: root
+      database: cms_test
+      host: 172.0.31.57
+      password: Tjlh@2017
       port: 3306
       user: root
       minsize: 10
@@ -20,3 +20,7 @@ tortoise:
       - app.model.mysql_model
     default_connection: default
   log_queries: true  # enable query logging
+
+getSpider:
+  # interval of the spider scheduled job, in minutes
+  interval: 5
\ No newline at end of file
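The new getSpider block is consumed through the cached app config and drives the APScheduler interval trigger shown further down. A minimal sketch of reading the value, assuming the file name application.yaml and the helper load_spider_interval (neither name appears in the diff):

import yaml

# Hypothetical path; the real config file name is not shown in the diff.
CONFIG_PATH = "application.yaml"


def load_spider_interval(path: str = CONFIG_PATH) -> int:
    # Parse the YAML file and read the scheduler interval in minutes,
    # falling back to 5 when the key is missing.
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    return config.get("getSpider", {}).get("interval", 5)


if __name__ == "__main__":
    print(f"spider job interval: {load_spider_interval()} minutes")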
App config loader (app.config.app_config):

+import functools
 import os
 import yaml
 
@@ -7,8 +8,8 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-
-def getAppConfig():
+@functools.lru_cache()
+def getAppConfig()->AppConfig:
     logger.info('Start loading AppConfig')
     # Get the absolute path of the current file
     current_file_path = os.path.abspath(__file__)
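Decorating getAppConfig with @functools.lru_cache() makes the YAML parse happen once per process; every later call, including the one evaluated in the scheduler decorator below, gets the same cached object. A small stand-in illustrating the behaviour (load_config is a placeholder, not the project's function):

import functools


@functools.lru_cache()
def load_config() -> dict:
    # Stand-in for getAppConfig(): pretend this parses a YAML file.
    print("parsing config file ...")
    return {"getSpider": {"interval": 5}}


a = load_config()  # prints "parsing config file ..." exactly once
b = load_config()  # served from the cache, no re-parse
assert a is b      # both calls return the very same object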
Tortoise config loader:

+import functools
 import os
 import yaml
 from app.schemas.config_schema import TortoiseConfig
@@ -5,7 +6,7 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-
+@functools.lru_cache()
 def getTortoiseConfig():
     logger.info('Start loading TortoiseConfig')
     # Get the absolute path of the current file
Scheduler module:

-from datetime import datetime
-
+from datetime import datetime, timedelta
+import hashlib
+import logging
+from typing import Dict, List, Tuple
+from app.model.mysql_model import XWebCrawler
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from superstream import Stream
+from app.config.app_config import getAppConfig
+from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum
 
+logger = logging.getLogger(__name__)
 
 # Import asynccontextmanager to create an async context manager
 
@@ -9,11 +16,95 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 scheduler = AsyncIOScheduler()
 
 
-# Scheduled job that runs every minute
-@scheduler.scheduled_job('interval', minutes=1)
-async def cron_job():
-    # The body of the job, e.g. print the current time
-    print(f"The current time is {datetime.now()}")
+@scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval"))
+async def get_spider_config():
+    from app.service.spider_job_service import (
+        add_job,
+        add_guduo_job,
+        delete_job,
+        get_job_info,
+    )
+
+    logger.info("Start fetching the spider configuration")
+    web_crawlers: List[XWebCrawler] = await XWebCrawler.all()
+    if not web_crawlers:
+        logger.info("No spider tasks found")
+        return
+    status_group: Dict[int, List[XWebCrawler]] = Stream(web_crawlers).group_by(
+        lambda x: x.status
+    )
+    active_jobs = status_group.get(1, [])
+    if not active_jobs:
+        logger.info("No enabled spider tasks found")
+        return
+    # Get the jobs that are already registered
+    jobs: List[ApschedulerJob] = get_job_info()
+    # Handle tasks that have been switched off
+    regist_jobs: Dict[int, ApschedulerJob] = Stream(jobs).group_by(lambda x: x.taskId)
+    for job in status_group.get(0, []):
+        if job.id in regist_jobs:
+            delete_job(job.id)
+            logger.info(f"Job deleted, task id: {job.id}")
+    # Handle new scheduled tasks
+    for job in status_group.get(1, []):
+        # Build the parameters
+        # Get today's and yesterday's dates in yyyy-MM-dd format
+        dates = get_dates()
+        types = [TypeEnum.EPISODES, TypeEnum.MOVIE, TypeEnum.ANIME, TypeEnum.VARIETY]
+        md5 = calculate_md5(job.id, job.target_url, job.param_info, job.cron)
+        # Compute
+        param = SpiderParams(
+            startDate=dates[1],
+            endDate=dates[0],
+            target_type=types,
+            url=job.target_url,
+        )
+        taskInfo = TaskInfo(
+            taskStart=True,
+            executeStrategy=job.cron,
+            taskId=job.id,
+            url=job.target_url,
+            taskMd5=md5,
+        )
+        if job.id not in regist_jobs:
+            add_guduo_job(param, taskInfo)
+            logger.info(f"Job added, task id: {job.id}")
+        else:
+            exists_jobs: List[ApschedulerJob] = get_job_info(job.id)
+            if exists_jobs[0].taskMd5 != md5:
+                delete_job(job.id)
+                add_guduo_job(param, taskInfo)
+                logger.info(f"Job updated, task id: {job.id}")
+    logger.info("Spider configuration finished")
+
+
+def calculate_md5(*fields):
+    """
+    Compute the MD5 of multiple fields
+    :param fields: the fields
+    :return: MD5 hex digest
+    """
+    md5 = hashlib.md5()
+    for field in fields:
+        md5.update(str(field).encode("utf-8"))
+    return md5.hexdigest()
+
 
+def get_dates() -> Tuple[str, str]:
+    """
+    Get today's and yesterday's dates in yyyy-MM-dd format
+    :return: Tuple[str, str]
+    """
+    today = datetime.today()
+    yesterday = today - timedelta(days=1)
+    today_str = today.strftime("%Y-%m-%d")
+    yesterday_str = yesterday.strftime("%Y-%m-%d")
+    return today_str, yesterday_str
 
 
+if __name__ == "__main__":
+    new_dict = {}
+    print(new_dict.get("a"))
+    print(hash((1, "b", True)))
+    new_dict.update({"a": 1})
+    print("a" in new_dict)
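The change-detection idea: id, target_url, param_info and cron are folded into one MD5 fingerprint (taskMd5); on each poll a registered job whose stored fingerprint no longer matches is deleted and re-added. A self-contained sketch of that comparison, with made-up URL, parameter and cron values:

import hashlib


def calculate_md5(*fields) -> str:
    # Fold the string form of every field into a single MD5 digest.
    md5 = hashlib.md5()
    for field in fields:
        md5.update(str(field).encode("utf-8"))
    return md5.hexdigest()


old = calculate_md5(1, "https://example.com/rank", '{"page": 1}', "0 2 * * *")
new = calculate_md5(1, "https://example.com/rank", '{"page": 1}', "0 3 * * *")  # cron changed
print(old != new)  # True -> the registered job would be dropped and re-added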
MySQL models (app.model.mysql_model):

@@ -6,13 +6,54 @@ from app.schemas.spider_schema import TypeEnum
 
 
 class SpiderModel(Model):
+    """
+    Data model for spider results
+    """
+
     id = fields.IntField(pk=True)
-    time = fields.CharField(max_length=10, description="Data date, yyyy-MM-dd formatted string", regex=r'^\d{4}-\d{2}-\d{2}$')
-    targetType = fields.IntEnumField(TypeEnum, description="Data type", source_field='target_type')
+    time = fields.CharField(
+        max_length=10,
+        description="Data date, yyyy-MM-dd formatted string",
+        regex=r"^\d{4}-\d{2}-\d{2}$",
+    )
+    targetType = fields.IntEnumField(
+        TypeEnum, description="Data type", source_field="target_type"
+    )
     platform = fields.CharField(max_length=255, description="Platform name")
-    sourceName = fields.CharField(max_length=255, description="Media resource name", source_field='source_name')
+    sourceName = fields.CharField(
+        max_length=255, description="Media resource name", source_field="source_name"
+    )
     score = fields.FloatField(description="Popularity score")
-    createTime = fields.DatetimeField(default=datetime.now, description="Creation time", source_field='create_time')
-    updateTime = fields.DatetimeField(default=datetime.now, description="Update time", source_field='update_time')
+    createTime = fields.DatetimeField(
+        default=datetime.now, description="Creation time", source_field="create_time"
+    )
+    updateTime = fields.DatetimeField(
+        default=datetime.now, description="Update time", source_field="update_time"
+    )
+
     class Meta:
         table = "spider_data"
+
+
+class XWebCrawler(Model):
+    id = fields.BigIntField(pk=True, description="ID")
+    code = fields.CharField(max_length=64, unique=True, description="Identifier (crawl_)")
+    name = fields.CharField(max_length=255, description="Name")
+    description = fields.TextField(null=True, description="Description")
+    type = fields.IntField(default=0, description="Type 0: normal; (to be extended)")
+    proxy_mode = fields.IntField(
+        default=0, description="Proxy mode 0: no proxy; (to be extended)"
+    )
+    status = fields.IntField(description="Status 0: disabled; 1: enabled;")
+    cron = fields.CharField(
+        max_length=32, description="Execution schedule, standard cron expression (json)"
+    )
+    param_info = fields.TextField(null=True, description="Parameter info")
+    target_url = fields.CharField(max_length=1000, null=True, description="Target URL")
+    sequence = fields.IntField(default=0, description="Display order")
+    create_time = fields.DatetimeField(null=True, description="Creation time")
+    update_time = fields.DatetimeField(null=True, description="Update time")
+
+    class Meta:
+        table = "x_web_crawler"
+        table_description = "Web crawler table"
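XWebCrawler maps the x_web_crawler table; the scheduler above pulls every row with XWebCrawler.all() and then splits on status in memory. An equivalent query that pushes the status filter into SQL could look like the sketch below (the helper name and the order_by("sequence") choice are assumptions, and Tortoise must already be initialised with the connection configured earlier):

from typing import List

from app.model.mysql_model import XWebCrawler


async def load_enabled_crawlers() -> List[XWebCrawler]:
    # Same rows as status_group.get(1, []) after XWebCrawler.all(),
    # but the status filter runs in the database instead of in memory.
    return await XWebCrawler.filter(status=1).order_by("sequence")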
Pydantic schemas (app.schemas.spider_schema):

@@ -81,6 +81,7 @@ class SpiderParams(BaseModel):
 class ApschedulerJob(BaseModel):
     jobId: Job
     taskId: int
+    taskMd5: str
 
     class Config:
         arbitrary_types_allowed = True
@@ -91,3 +92,4 @@ class TaskInfo(BaseModel):
     executeStrategy: str
     taskId: int
     url: str
+    taskMd5: str
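Both ApschedulerJob and TaskInfo gain a taskMd5 field so the fingerprint computed from the database row travels with the registered job and can be compared on the next poll. A minimal construction example, with illustrative values only (a real TaskInfo is built from an XWebCrawler row):

from app.schemas.spider_schema import TaskInfo

# Illustrative values, not taken from the diff.
task = TaskInfo(
    taskStart=True,
    executeStrategy="0 2 * * *",
    taskId=1,
    url="https://example.com/rank",
    taskMd5="9a0364b9e99bb480dd25e1f0284c8555",
)
print(task.taskMd5)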
Job service (app.service.spider_job_service):

@@ -40,7 +40,7 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
         week=weekend,
         args=params,
     )
-    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
+    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.md5()))
     logger.info(
         f"Job added, task id: {taskInfo.taskId}, execution strategy: {taskInfo.executeStrategy}"
     )
@@ -64,8 +64,10 @@ def delete_job(taskId: int):
     logger.info(f"Job deleted, task id: {taskId}")
 
 
-def get_job_info(taskId: int):
-    job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
+def get_job_info(taskId: int) -> List[ApschedulerJob]:
+    if taskId is None:
+        return jobs
+    job = Stream(jobs).filter(lambda x: x.taskId == taskId).to_list()
     return f"Job info -> {job}"
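get_job_info now serves two purposes: with no task id it returns the whole in-memory registry, with a task id it filters it. A stand-alone sketch of that lookup pattern, assuming an Optional default for the parameter (the diff shows the parameter without one) and a simplified RegisteredJob stand-in for ApschedulerJob:

from typing import List, Optional


class RegisteredJob:
    # Simplified stand-in for ApschedulerJob (the real one also wraps the APScheduler Job).
    def __init__(self, task_id: int, task_md5: str):
        self.taskId = task_id
        self.taskMd5 = task_md5


jobs: List[RegisteredJob] = []


def get_job_info(task_id: Optional[int] = None) -> List[RegisteredJob]:
    # No argument -> return the whole registry; otherwise filter by task id.
    if task_id is None:
        return jobs
    return [j for j in jobs if j.taskId == task_id]


jobs.append(RegisteredJob(1, "abc"))
print(len(get_job_info()))   # 1
print(len(get_job_info(2)))  # 0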