spider_schema.py
2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from datetime import datetime, timedelta
from enum import IntEnum
from typing import List, Optional

from apscheduler.job import Job
from pydantic import BaseModel, Field
class TypeEnum(IntEnum):
    """Content categories handled by the spider."""

    EPISODES = 1  # TV series
    MOVIE = 2  # movie
    ANIME = 3  # anime

    def get_precise_positioning(self, other) -> Optional[List[str]]:
        """Return the selector strings associated with the category ``other``.

        The lookup depends only on ``other``; the member the method is called
        on does not influence the result.

        Args:
            other: A ``TypeEnum`` member or anything that compares equal to
                one (for example the plain integers 1, 2 or 3).

        Returns:
            A new list of CSS-style selector strings for the matching
            category, or ``None`` when ``other`` matches no category.
            NOTE(review): the selectors are presumably consumed in order by
            the scraper - confirm against the caller.
        """
        # Members are referenced through the class rather than through
        # ``self``: member-from-member access is discouraged by the enum
        # documentation and its behaviour has changed across Python versions.
        if TypeEnum.ANIME == other:
            return [
                '.cateIcon_5',
                'li.anime-platform:nth-child(1)',
            ]
        if TypeEnum.MOVIE == other:
            return [
                '.cateIcon_3',
                'ul.type-box:nth-child(1) > li:nth-child(1)',
            ]
        if TypeEnum.EPISODES == other:
            return [
                '.cateIcon_1',
                'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)',
                'ul.type-box:nth-child(1) > li:nth-child(1)',
            ]
        # Unknown category: make the fall-through result explicit.
        return None
class GuoDuoSpiderResult(BaseModel):
    """Schema for a single scraped record.

    Holds the data date, content category, platform name, media title,
    popularity score and the creation timestamp of the record.
    """

    # Data date as a ``yyyy-MM-dd`` string. The pattern is a raw string so
    # that ``\d`` is not treated as an (invalid) string escape sequence.
    time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # Content category: 1 = TV series, 2 = movie, 3 = anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Media resource name.
    sourceName: str
    # Popularity ("heat") score.
    score: float
    # Creation time.
    createTime: datetime

    class Config:
        # Explicitly leaves pydantic's ``from_attributes`` option switched off.
        from_attributes = False
class SpiderParams(BaseModel):
    """Input parameters for a spider run.

    Carries the inclusive date window, the content categories to crawl and
    the URL to crawl.
    """

    # Date bounds as ``yyyy-MM-dd`` strings. The patterns are raw strings so
    # that ``\d`` is not treated as an (invalid) string escape sequence.
    startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # Content categories: 1 = TV series, 2 = movie, 3 = anime.
    target_type: List[TypeEnum]
    # URL to crawl.
    url: str

    class Config:
        # Explicitly leaves pydantic's ``from_attributes`` option switched off.
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """Return every day from ``startDate`` to ``endDate``, both inclusive.

        Returns:
            The days in ascending order, each formatted as ``yyyy-MM-dd``.
            The list is empty when ``endDate`` is earlier than ``startDate``.

        Raises:
            ValueError: If either bound is not a real calendar date; the
                field pattern only checks the digit layout.
        """
        date_format = '%Y-%m-%d'
        first_day = datetime.strptime(self.startDate, date_format)
        last_day = datetime.strptime(self.endDate, date_format)
        # +1 so the end date itself is part of the range; a negative span
        # yields an empty ``range`` and therefore an empty list.
        day_count = (last_day - first_day).days + 1
        return [
            (first_day + timedelta(days=offset)).strftime(date_format)
            for offset in range(day_count)
        ]
class ApschedulerJob(BaseModel):
    """Pairs an APScheduler ``Job`` object with a task id."""

    # Despite the name, this field is annotated as the ``Job`` object itself.
    jobId: Job
    # Integer id of the associated task.
    taskId: int

    class Config:
        # ``Job`` is a plain third-party class, so arbitrary (non-pydantic)
        # field types are enabled for this model.
        arbitrary_types_allowed = True
class TaskInfo(BaseModel):
    """Describes a task: start flag, execution strategy, task id and URL."""

    # Boolean start flag for the task.
    taskStart: bool
    # Execution strategy as a string.
    # NOTE(review): the accepted values are not visible here - confirm with callers.
    executeStrategy: str
    # Integer id of the task.
    taskId: int
    # URL associated with the task.
    url: str