文鑫 / guduo_spider
Commit 50c996cab94055b1bb62dd2da5cce62d08a90373
Authored by wenxin, 2024-12-23 13:28:39 +0800
Add fetching of scheduled-task information from the database
1 parent 74924ed8
Showing 8 changed files with 164 additions and 21 deletions:
- app/application.yaml
- app/config/app_config.py
- app/config/tortoise_config.py
- app/job/job.py
- app/model/mysql_model.py
- app/schemas/config_schema.py
- app/schemas/spider_schema.py
- app/service/spider_job_service.py
app/application.yaml

```diff
@@ -5,9 +5,9 @@ tortoise:
   default:
     engine: tortoise.backends.mysql
     credentials:
-      database: fast_api
-      host: 127.0.0.1
-      password: root
+      database: cms_test
+      host: 172.0.31.57
+      password: Tjlh@2017
       port: 3306
       user: root
       minsize: 10
@@ -20,3 +20,7 @@ tortoise:
     - app.model.mysql_model
   default_connection: default
-  log_queries: true
+  log_queries: true  # enable query logging
+
+getSpider:
+  # interval, in minutes, for the spider scheduled task
+  interval: 5
\ No newline at end of file
```
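The new top-level `getSpider` block is what `getAppConfig().getSpider.get("interval")` in app/job/job.py reads. A minimal sketch of that path, assuming the YAML is loaded with PyYAML and validated by the `AppConfig` schema (the real loading lives in app/config/app_config.py and is only partially shown in this diff):

```python
# Sketch only: a trimmed-down AppConfig with just the field this commit adds.
from typing import Dict

import yaml
from pydantic import BaseModel


class AppConfigSketch(BaseModel):
    getSpider: Dict[str, int]  # e.g. {"interval": 5}


raw = yaml.safe_load("getSpider:\n  interval: 5")
config = AppConfigSketch(**raw)
print(config.getSpider.get("interval"))  # -> 5, the scheduler interval in minutes
```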
app/config/app_config.py

```diff
@@ -1,2 +1,3 @@
+import functools
 import os
 import yaml
@@ -7,8 +8,8 @@ import logging

 logger = logging.getLogger(__name__)


-def getAppConfig():
+@functools.lru_cache()
+def getAppConfig() -> AppConfig:
     logger.info('开始加载AppConfig')
     # get the absolute path of the current file
     current_file_path = os.path.abspath(__file__)
```
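The practical effect of the new `@functools.lru_cache()` decorator is that the YAML is read and parsed only on the first call; every later `getAppConfig()` returns the same cached object. A standalone illustration (not repo code):

```python
import functools

calls = 0


@functools.lru_cache()
def load_config() -> dict:
    """Stands in for getAppConfig(); the body runs exactly once."""
    global calls
    calls += 1
    return {"getSpider": {"interval": 5}}


a = load_config()
b = load_config()
print(calls, a is b)  # -> 1 True: parsed once, same instance returned
```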
app/config/tortoise_config.py

```diff
@@ -1,3 +1,4 @@
+import functools
 import os
 import yaml
 from app.schemas.config_schema import TortoiseConfig
@@ -5,7 +6,7 @@ import logging

 logger = logging.getLogger(__name__)


+@functools.lru_cache()
 def getTortoiseConfig():
     logger.info('开始加载TortoiseConfig')
     # get the absolute path of the current file
```
app/job/job.py

```diff
@@ -1,8 +1,15 @@
-from datetime import datetime
+from datetime import datetime, timedelta
+import hashlib
 import logging
+from typing import Dict, List, Tuple

+from app.model.mysql_model import XWebCrawler
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from superstream import Stream
+
+from app.config.app_config import getAppConfig
+from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum

 logger = logging.getLogger(__name__)

 # import asynccontextmanager to build async context managers
@@ -9,11 +16,95 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 scheduler = AsyncIOScheduler()


 # scheduled job that runs every minute
 @scheduler.scheduled_job('interval', minutes=1)
 async def cron_job():
     # the job body, e.g. print the current time
     print(f"The current time is {datetime.now()}")


+@scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval"))
+async def get_spider_config():
+    from app.service.spider_job_service import (
+        add_job,
+        add_guduo_job,
+        delete_job,
+        get_job_info,
+    )
+
+    logger.info("开始获取爬虫配置")
+    web_crawlers: List[XWebCrawler] = await XWebCrawler.all()
+    if not web_crawlers:
+        logger.info("未获取到爬虫任务")
+        return
+    status_group: Dict[int, List[XWebCrawler]] = Stream(web_crawlers).group_by(
+        lambda x: x.status
+    )
+    active_jobs = status_group.get(1, [])
+    if not active_jobs:
+        logger.info("未获取到启用的爬虫任务")
+        return
+    # fetch the jobs already registered with the scheduler
+    jobs: List[ApschedulerJob] = get_job_info()
+    # handle jobs that have been switched off
+    regist_jobs: Dict[int, List[ApschedulerJob]] = Stream(jobs).group_by(lambda x: x.taskId)
+    for job in status_group.get(0, []):
+        if job.id in regist_jobs:
+            delete_job(job.id)
+            logger.info(f"删除任务成功,任务id:{job.id}")
+    # handle newly enabled scheduled jobs
+    for job in status_group.get(1, []):
+        # build the parameters:
+        # today's and yesterday's dates, formatted yyyy-MM-dd
+        dates = get_dates()
+        types = [TypeEnum.EPISODES, TypeEnum.MOVIE, TypeEnum.ANIME, TypeEnum.VARIETY]
+        md5 = calculate_md5(job.id, job.target_url, job.param_info, job.cron)  # compute the fingerprint
+        param = SpiderParams(
+            startDate=dates[1],
+            endDate=dates[0],
+            target_type=types,
+            url=job.target_url,
+        )
+        taskInfo = TaskInfo(
+            taskStart=True,
+            executeStrategy=job.cron,
+            taskId=job.id,
+            url=job.target_url,
+            taskMd5=md5,
+        )
+        if job.id not in regist_jobs:
+            add_guduo_job(param, taskInfo)
+            logger.info(f"添加任务成功,任务id:{job.id}")
+        else:
+            exists_jobs: List[ApschedulerJob] = get_job_info(job.id)
+            if exists_jobs[0].taskMd5 != md5:
+                delete_job(job.id)
+                add_guduo_job(param, taskInfo)
+                logger.info(f"更新任务成功,任务id:{job.id}")
+    logger.info("爬虫配置完成")
+
+
+def calculate_md5(*fields):
+    """
+    Compute the MD5 digest of several fields.
+    :param fields: the fields to hash
+    :return: the hex MD5 digest
+    """
+    md5 = hashlib.md5()
+    for field in fields:
+        md5.update(str(field).encode("utf-8"))
+    return md5.hexdigest()
+
+
+def get_dates() -> Tuple[str, str]:
+    """
+    Today's and yesterday's dates, formatted yyyy-MM-dd.
+    :return: Tuple[str, str]
+    """
+    today = datetime.today()
+    yesterday = today - timedelta(days=1)
+    today_str = today.strftime("%Y-%m-%d")
+    yesterday_str = yesterday.strftime("%Y-%m-%d")
+    return today_str, yesterday_str
+
+
+if __name__ == "__main__":
+    new_dict = {}
+    print(new_dict.get("a"))
+    print(hash((1, "b", True)))
+    new_dict.update({"a": 1})
+    print("a" in new_dict)
```
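The core of this change is the MD5 fingerprint: a job is re-registered only when one of its defining fields (id, target_url, param_info, cron) changes. A quick standalone check of that behaviour (the function body mirrors `calculate_md5` above; the values are made up):

```python
import hashlib


def calculate_md5(*fields):
    # same technique as app/job/job.py: hash the stringified fields in order
    md5 = hashlib.md5()
    for field in fields:
        md5.update(str(field).encode("utf-8"))
    return md5.hexdigest()


before = calculate_md5(1, "https://example.com", None, "0 * * * *")
after = calculate_md5(1, "https://example.com", None, "30 * * * *")  # cron edited
print(before != after)  # -> True: the changed cron triggers delete + re-add
```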
app/model/mysql_model.py

```diff
@@ -6,13 +6,54 @@ from app.schemas.spider_schema import TypeEnum
 class SpiderModel(Model):
     """
     Data model for spider results.
     """

     id = fields.IntField(pk=True)
-    time = fields.CharField(max_length=10, description="数据时间 yyyy-MM-dd 格式的字符", regex=r'^\d{4}-\d{2}-\d{2}$')
-    targetType = fields.IntEnumField(TypeEnum, description="数据类型", source_field='target_type')
+    time = fields.CharField(
+        max_length=10,
+        description="数据时间 yyyy-MM-dd 格式的字符",
+        regex=r"^\d{4}-\d{2}-\d{2}$",
+    )
+    targetType = fields.IntEnumField(TypeEnum, description="数据类型", source_field="target_type")
     platform = fields.CharField(max_length=255, description="平台名字")
-    sourceName = fields.CharField(max_length=255, description="媒体资源名字", source_field='source_name')
+    sourceName = fields.CharField(max_length=255, description="媒体资源名字", source_field="source_name")
     score = fields.FloatField(description="热度得分")
-    createTime = fields.DatetimeField(default=datetime.now, description="创建时间", source_field='create_time')
-    updateTime = fields.DatetimeField(default=datetime.now, description="更新时间", source_field='update_time')
+    createTime = fields.DatetimeField(default=datetime.now, description="创建时间", source_field="create_time")
+    updateTime = fields.DatetimeField(default=datetime.now, description="更新时间", source_field="update_time")

     class Meta:
         table = "spider_data"
+
+
+class XWebCrawler(Model):
+    id = fields.BigIntField(pk=True, description="ID")
+    code = fields.CharField(max_length=64, unique=True, description="标识(crawl_)")
+    name = fields.CharField(max_length=255, description="名称")
+    description = fields.TextField(null=True, description="描述")
+    type = fields.IntField(default=0, description="类型0:普通;(待扩展)")
+    proxy_mode = fields.IntField(default=0, description="代理模式0:不使用代理;(待扩展)")
+    status = fields.IntField(description="状态 0:关闭;1:启用;")
+    cron = fields.CharField(max_length=32, description="执行周期,标准cron表达式(json)")
+    param_info = fields.TextField(null=True, description="参数信息")
+    target_url = fields.CharField(max_length=1000, null=True, description="目标地址")
+    sequence = fields.IntField(default=0, description="显示顺序")
+    create_time = fields.DatetimeField(null=True, description="创建时间")
+    update_time = fields.DatetimeField(null=True, description="更新时间")
+
+    class Meta:
+        table = "x_web_crawler"
+        table_description = "爬虫表"
```
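`get_spider_config` loads every row with `XWebCrawler.all()` and splits by `status` in Python. With Tortoise ORM the enabled rows could also be fetched with a filter pushed down to MySQL; a sketch of that alternative (not what this commit does):

```python
from typing import List

from app.model.mysql_model import XWebCrawler


async def get_active_crawlers() -> List[XWebCrawler]:
    # status == 1 means "enabled" per the field description above
    return await XWebCrawler.filter(status=1)
```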
app/schemas/config_schema.py

```diff
@@ -48,3 +48,4 @@ class Port(BaseModel):

 class AppConfig(BaseModel):
     server: Port
+    getSpider: Dict[str, int]
```
app/schemas/spider_schema.py

```diff
@@ -81,6 +81,7 @@ class SpiderParams(BaseModel):

 class ApschedulerJob(BaseModel):
     jobId: Job
     taskId: int
+    taskMd5: str

     class Config:
         arbitrary_types_allowed = True
@@ -91,3 +92,4 @@ class TaskInfo(BaseModel):
     executeStrategy: str
     taskId: int
     url: str
+    taskMd5: str
```
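`arbitrary_types_allowed` is what lets a non-pydantic type such as APScheduler's `Job` be used as the `jobId` field. A minimal illustration in the same pydantic v1 `class Config` style; `Handle` is a hypothetical stand-in for `Job`:

```python
from pydantic import BaseModel


class Handle:
    """Hypothetical stand-in for apscheduler.job.Job, which is not a pydantic type."""


class ApschedulerJobSketch(BaseModel):
    jobId: Handle
    taskId: int
    taskMd5: str

    class Config:
        arbitrary_types_allowed = True  # without this, pydantic rejects the Handle annotation


job = ApschedulerJobSketch(jobId=Handle(), taskId=1, taskMd5="d41d8cd9")
print(job.taskId)  # -> 1
```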
app/service/spider_job_service.py

```diff
@@ -40,7 +40,7 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any):
         week=weekend,
         args=params,
     )
-    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId))
+    jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.taskMd5))
     logger.info(f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}")
@@ -64,8 +64,10 @@ def delete_job(taskId: int):
     logger.info(f"删除任务成功,任务id:{taskId}")


-def get_job_info(taskId: int):
-    job = Stream(jobs).filter(lambda x: x.taskId == taskId).find_first()
-    return f"job 信息->{job}"
+def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
+    if taskId is None:
+        return jobs
+    return Stream(jobs).filter(lambda x: x.taskId == taskId).to_list()
```
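For reference, the `Stream` pipeline that `get_job_info` relies on behaves like this; a standalone run using only the superstream calls already present in the service (`filter` and `to_list`), with made-up task ids:

```python
from superstream import Stream

# stand-in records; the service stores ApschedulerJob objects instead
jobs = [{"taskId": 1}, {"taskId": 2}, {"taskId": 1}]

matches = Stream(jobs).filter(lambda x: x["taskId"] == 1).to_list()
print(matches)  # -> [{'taskId': 1}, {'taskId': 1}]
```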