Commit 50c996ca (50c996cab94055b1bb62dd2da5cce62d08a90373) by wenxin

Add fetching of scheduled-task info from the database

1 parent 74924ed8
@@ -5,9 +5,9 @@ tortoise:
   default:
     engine: tortoise.backends.mysql
     credentials:
-      database: fast_api
-      host: 127.0.0.1
-      password: root
+      database: cms_test
+      host: 172.0.31.57
+      password: Tjlh@2017
       port: 3306
       user: root
       minsize: 10
@@ -20,3 +20,7 @@ tortoise:
     - app.model.mysql_model
   default_connection: default
   log_queries: true  # enable query logging
+
+getSpider:
+  # interval of the spider scheduled job, in minutes
+  interval: 5
\ No newline at end of file
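For orientation, here is a minimal sketch of how the new getSpider block is consumed through the AppConfig schema changed further below. The loading code is an illustrative assumption, not the project's actual loader (which lives in getAppConfig()):

    import yaml
    from typing import Dict
    from pydantic import BaseModel

    class AppConfig(BaseModel):
        # only the field relevant here; the real schema also declares `server: Port`
        getSpider: Dict[str, int]

    raw = yaml.safe_load("""
    getSpider:
      interval: 5
    """)
    cfg = AppConfig(getSpider=raw["getSpider"])
    print(cfg.getSpider.get("interval"))  # -> 5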
+import functools
 import os
 import yaml
@@ -7,8 +8,8 @@ import logging
 logger = logging.getLogger(__name__)
-def getAppConfig():
+@functools.lru_cache()
+def getAppConfig() -> AppConfig:
     logger.info('Start loading AppConfig')
     # get the absolute path of the current file
     current_file_path = os.path.abspath(__file__)
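Caching the loader with functools.lru_cache means the YAML is parsed once per process and every later call returns the same AppConfig instance. A self-contained sketch of the behavior; the function body is a stand-in, not the real loader:

    import functools

    @functools.lru_cache()
    def load_config() -> dict:
        print("parsing yaml ...")  # printed only on the first call
        return {"getSpider": {"interval": 5}}

    load_config()  # parses
    load_config()  # served from the cache; no second parse

One consequence worth noting: in the scheduler below, getAppConfig().getSpider.get("interval") is evaluated when the @scheduler.scheduled_job decorator runs at import time, so the interval is fixed for the life of the process.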
+import functools
 import os
 import yaml
 from app.schemas.config_schema import TortoiseConfig
@@ -5,7 +6,7 @@ import logging
 logger = logging.getLogger(__name__)
+@functools.lru_cache()
 def getTortoiseConfig():
     logger.info('Start loading TortoiseConfig')
     # get the absolute path of the current file
-from datetime import datetime
+from datetime import datetime, timedelta
+import hashlib
+import logging
+from typing import Dict, List, Tuple
+from app.model.mysql_model import XWebCrawler
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from superstream import Stream
+from app.config.app_config import getAppConfig
+from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum
+
+logger = logging.getLogger(__name__)
 # asynccontextmanager is imported to create an async context manager
@@ -9,11 +16,95 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 scheduler = AsyncIOScheduler()
-# scheduled job that runs every minute
-@scheduler.scheduled_job('interval', minutes=1)
-async def cron_job():
-    # job body, e.g. print the current time
-    print(f"The current time is {datetime.now()}")
@scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval"))
async def get_spider_config():
from app.service.spider_job_service import (
add_job,
add_guduo_job,
delete_job,
get_job_info,
)
logger.info("开始获取爬虫配置")
web_crawlers: List[XWebCrawler] = await XWebCrawler.all()
if not web_crawlers:
logger.info("未获取到爬虫任务")
return
status_group: Dict[int, List[XWebCrawler]] = Stream(web_crawlers).group_by(
lambda x: x.status
)
active_jobs = status_group.get(1, [])
if not active_jobs:
logger.info("未获取到启用的爬虫任务")
return
# 获取已经注册的任务
jobs: List[ApschedulerJob] = get_job_info()
# 处理已经关闭的任务
regist_jobs: Dict[int, ApschedulerJob] = Stream(jobs).group_by(lambda x: x.taskId)
for job in status_group.get(0, []):
if job.id in regist_jobs:
delete_job(job.id)
logger.info(f"删除任务成功,任务id:{job.id}")
# 处理新的定时任务
for job in status_group.get(1, []):
# 构建参数
# 获得今天的日期与昨天的日期 格式是 yyyy-MM-dd
dates = get_dates()
types = [TypeEnum.EPISODES, TypeEnum.MOVIE, TypeEnum.ANIME, TypeEnum.VARIETY]
md5 = calculate_md5(job.id, job.target_url, job.param_info, job.cron)
# 计算
param = SpiderParams(
startDate=dates[1],
endDate=dates[0],
target_type=types,
url=job.target_url,
)
taskInfo = TaskInfo(
taskStart=True,
executeStrategy=job.cron,
taskId=job.id,
url=job.target_url,
taskMd5=md5,
)
if job.id not in regist_jobs:
add_guduo_job(param, taskInfo)
logger.info(f"添加任务成功,任务id:{job.id}")
else:
exists_jobs: List[ApschedulerJob] = get_job_info(job.id)
if exists_jobs[0].taskMd5 != md5:
delete_job(job.id)
add_guduo_job(param, taskInfo)
logger.info(f"更新任务成功,任务id:{job.id}")
logger.info("爬虫配置完成")
+def calculate_md5(*fields):
+    """
+    Compute one MD5 digest over multiple fields.
+    :param fields: the fields to hash
+    :return: hex MD5 digest
+    """
+    md5 = hashlib.md5()
+    for field in fields:
+        md5.update(str(field).encode("utf-8"))
+    return md5.hexdigest()
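The digest acts as a change fingerprint: any edit to a crawler's id, target_url, param_info, or cron yields a different MD5, which is what triggers the delete-and-re-add path above. One caveat: because the stringified fields are concatenated without a separator, distinct tuples can in principle collide (("ab", "c") and ("a", "bc") hash identically); joining with a delimiter would avoid that. A quick illustrative check:

    old = calculate_md5(1, "http://example.com", '{"a": 1}', "0 0 * * *")
    new = calculate_md5(1, "http://example.com", '{"a": 2}', "0 0 * * *")
    assert old != new  # a parameter change produces a new fingerprint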
+def get_dates() -> Tuple[str, str]:
+    """
+    Return today's and yesterday's dates as yyyy-MM-dd strings.
+    :return: Tuple[str, str] of (today, yesterday)
+    """
+    today = datetime.today()
+    yesterday = today - timedelta(days=1)
+    today_str = today.strftime("%Y-%m-%d")
+    yesterday_str = yesterday.strftime("%Y-%m-%d")
+    return today_str, yesterday_str
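Note the tuple order: index 0 is today and index 1 is yesterday, which is why the loop above passes dates[1] as startDate and dates[0] as endDate. For example:

    # on 2024-05-02 this evaluates to ("2024-05-02", "2024-05-01")
    today_str, yesterday_str = get_dates()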
+if __name__ == "__main__":
+    # ad-hoc scratch checks of dict and hash behavior
+    new_dict = {}
+    print(new_dict.get("a"))     # None: .get() on a missing key
+    print(hash((1, "b", True)))  # a tuple of hashables is hashable
+    new_dict.update({"a": 1})
+    print("a" in new_dict)       # True
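The reconciliation above leans on superstream's Stream.group_by returning a plain dict keyed by the grouping function's result, with each value being the list of matching items. A few lines of standard Python express the assumed semantics (a sketch, not the library's actual implementation):

    from collections import defaultdict

    def group_by(items, key):
        groups = defaultdict(list)
        for item in items:
            groups[key(item)].append(item)
        return dict(groups)

    # e.g. group_by(web_crawlers, lambda x: x.status) -> {0: [...], 1: [...]}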
@@ -6,13 +6,54 @@ from app.schemas.spider_schema import TypeEnum
 class SpiderModel(Model):
     """
     Spider result data model
     """
     id = fields.IntField(pk=True)
-    time = fields.CharField(max_length=10, description="data date, a yyyy-MM-dd string", regex=r'^\d{4}-\d{2}-\d{2}$')
-    targetType = fields.IntEnumField(TypeEnum, description="data type", source_field='target_type')
+    time = fields.CharField(
+        max_length=10,
+        description="data date, a yyyy-MM-dd string",
+        regex=r"^\d{4}-\d{2}-\d{2}$",
+    )
+    targetType = fields.IntEnumField(
+        TypeEnum, description="data type", source_field="target_type"
+    )
     platform = fields.CharField(max_length=255, description="platform name")
-    sourceName = fields.CharField(max_length=255, description="media resource name", source_field='source_name')
+    sourceName = fields.CharField(
+        max_length=255, description="media resource name", source_field="source_name"
+    )
     score = fields.FloatField(description="popularity score")
-    createTime = fields.DatetimeField(default=datetime.now, description="creation time", source_field='create_time')
-    updateTime = fields.DatetimeField(default=datetime.now, description="update time", source_field='update_time')
+    createTime = fields.DatetimeField(
+        default=datetime.now, description="creation time", source_field="create_time"
+    )
+    updateTime = fields.DatetimeField(
+        default=datetime.now, description="update time", source_field="update_time"
+    )

     class Meta:
         table = "spider_data"

+class XWebCrawler(Model):
+    id = fields.BigIntField(pk=True, description="ID")
+    code = fields.CharField(max_length=64, unique=True, description='identifier (prefixed with "crawl_")')
+    name = fields.CharField(max_length=255, description="name")
+    description = fields.TextField(null=True, description="description")
+    type = fields.IntField(default=0, description="type 0: normal; (to be extended)")
+    proxy_mode = fields.IntField(
+        default=0, description="proxy mode 0: no proxy; (to be extended)"
+    )
+    status = fields.IntField(description="status 0: disabled; 1: enabled")
+    cron = fields.CharField(
+        max_length=32, description="execution schedule, standard cron expression (json)"
+    )
+    param_info = fields.TextField(null=True, description="parameter info")
+    target_url = fields.CharField(max_length=1000, null=True, description="target URL")
+    sequence = fields.IntField(default=0, description="display order")
+    create_time = fields.DatetimeField(null=True, description="creation time")
+    update_time = fields.DatetimeField(null=True, description="update time")
+
+    class Meta:
+        table = "x_web_crawler"
+        table_description = "crawler table"
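Since XWebCrawler is a standard Tortoise ORM model, the status split done in memory above could also be pushed into the database. A small illustrative query, not part of the commit:

    # inside an async function, with Tortoise already initialised:
    active = await XWebCrawler.filter(status=1).all()
    disabled_ids = await XWebCrawler.filter(status=0).values_list("id", flat=True)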
@@ -48,3 +48,4 @@ class Port(BaseModel):
 class AppConfig(BaseModel):
     server: Port
+    getSpider: Dict[str, int]
@@ -81,6 +81,7 @@ class SpiderParams(BaseModel):
 class ApschedulerJob(BaseModel):
     jobId: Job
     taskId: int
+    taskMd5: str

     class Config:
         arbitrary_types_allowed = True
@@ -91,3 +92,4 @@ class TaskInfo(BaseModel):
     executeStrategy: str
     taskId: int
     url: str
+    taskMd5: str
@@ -40,7 +40,7 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
         week=weekend,
         args=params,
     )
-    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
+    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.taskMd5))
     logger.info(
         f"Task added, task id: {taskInfo.taskId}, schedule: {taskInfo.executeStrategy}"
     )
@@ -64,8 +64,10 @@ def delete_job(taskId: int):
     logger.info(f"Task deleted, task id: {taskId}")

-def get_job_info(taskId: int):
-    job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
-    return f"job info -> {job}"
+def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
+    if taskId is None:
+        return jobs
+    return Stream(jobs).filter(lambda x: x.taskId == taskId).to_list()
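With the default parameter, a bare call returns every registered job, while passing an id filters to that task; this matches how the scheduler above calls it. An illustrative usage (task id 5 is a made-up value):

    all_jobs = get_job_info()    # every registered ApschedulerJob
    task_jobs = get_job_info(5)  # jobs registered for task id 5, possibly empty
    if task_jobs:
        print(task_jobs[0].taskMd5)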