spider_schema.py
2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List
from apscheduler.job import Job
from pydantic import BaseModel, Field
# Media categories handled by the spider. The integer values are the
# category codes: 1 series, 2 movie, 3 anime, 4 variety show.
class TypeEnum(IntEnum):
    EPISODES = 1  # TV series
    MOVIE = 2  # movie
    ANIME = 3  # anime
    VARIETY = 4  # variety show

    def get_precise_positioning(self, other) -> List[str]:
        """Return the CSS selectors registered for the category ``other``.

        Only ``other`` drives the lookup; the member this method is called
        on does not influence the result.

        Args:
            other: A ``TypeEnum`` member, or any value comparable to one
                (for example the plain integer code).

        Returns:
            A new list of selector strings, or ``None`` when ``other``
            equals none of the four members.
        """
        # Ordered (category, selectors) pairs, scanned with ``==`` so that
        # plain integers match just like enum members do.
        selector_table = (
            (
                self.ANIME,
                (".cateIcon_5", "li.anime-platform:nth-child(1)"),
            ),
            (
                self.MOVIE,
                (".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"),
            ),
            (
                self.EPISODES,
                (
                    ".cateIcon_1",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ),
            ),
            (
                self.VARIETY,
                (
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ),
            ),
        )
        for category, selectors in selector_table:
            if category == other:
                # Hand back a fresh list so callers may mutate it freely.
                return list(selectors)
        return None
# One scraped heat-score entry for a media resource on a given date.
class GuoDuoSpiderResult(BaseModel):
    # Data date as a "yyyy-MM-dd" string. The pattern is a raw string so
    # that ``\d`` is passed to the regex engine verbatim instead of being
    # an invalid Python escape sequence.
    time: str = Field(
        ...,
        description="数据时间 yyyy-MM-dd 格式的字符",
        pattern=r"^\d{4}-\d{2}-\d{2}$",
    )
    # Category code: 1 series, 2 movie, 3 anime, 4 variety show.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Name of the media resource.
    sourceName: str
    # Heat (popularity) score.
    score: float
    # Creation time.
    createTime: datetime

    class Config:
        # Explicitly keeps pydantic's from_attributes mode switched off.
        from_attributes = False
# Parameters for one crawl: an inclusive date range, the categories to
# fetch and the address to crawl.
class SpiderParams(BaseModel):
    # First day of the range as a "yyyy-MM-dd" string. The patterns below
    # are raw strings so that ``\d`` is not an invalid Python escape.
    startDate: str = Field(
        ...,
        description="数据时间 yyyy-MM-dd 格式的字符",
        pattern=r"^\d{4}-\d{2}-\d{2}$",
    )
    # Last day of the range (inclusive) as a "yyyy-MM-dd" string.
    endDate: str = Field(
        ...,
        description="数据时间 yyyy-MM-dd 格式的字符",
        pattern=r"^\d{4}-\d{2}-\d{2}$",
    )
    # Category codes to crawl: 1 series, 2 movie, 3 anime.
    target_type: List[TypeEnum]
    # Address to crawl.
    url: str

    class Config:
        # Explicitly keeps pydantic's from_attributes mode switched off.
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """Return every date from ``startDate`` to ``endDate``, inclusive.

        Both bounds are parsed with the ``%Y-%m-%d`` format and each day in
        between is rendered back in that same format, in ascending order.

        Returns:
            The list of date strings; empty when ``endDate`` is earlier
            than ``startDate``.

        Raises:
            ValueError: If a bound matches the digit pattern but is not a
                real calendar date (for example ``"2023-13-40"``).
        """
        first_day = datetime.strptime(self.startDate, "%Y-%m-%d")
        last_day = datetime.strptime(self.endDate, "%Y-%m-%d")
        span_days = (last_day - first_day).days
        # ``+ 1`` makes the end date part of the result.
        return [
            (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)
        ]
# Associates an APScheduler job with the task it was created for.
class ApschedulerJob(BaseModel):
    # NOTE(review): despite the "Id" suffix, this field is annotated as the
    # APScheduler Job object itself, not an identifier - confirm intent.
    jobId: Job
    # Task identifier (presumably the same id as TaskInfo.taskId; verify).
    taskId: int
    # MD5 string for the task; what gets hashed is not visible here.
    taskMd5: str

    class Config:
        # Job is not a pydantic-native type, so arbitrary types must be
        # allowed for the jobId field to be accepted.
        arbitrary_types_allowed = True
# Description of a spider task. The field meanings below are inferred from
# the names only - TODO confirm against the code that builds TaskInfo.
class TaskInfo(BaseModel):
    # Presumably whether the task is switched on.
    taskStart: bool
    # Presumably how/when the task runs (e.g. a schedule spec); verify.
    executeStrategy: str
    # Task identifier.
    taskId: int
    # Presumably the address the task crawls.
    url: str
    # MD5 string associated with the task.
    taskMd5: str