spider_schema.py
3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List, Optional
from apscheduler.job import Job
from pydantic import BaseModel, Field
class TypeEnum(IntEnum):
    """Content category codes: 1 TV series, 2 movie, 3 variety show, 4 anime."""

    EPISODES = 1  # TV series
    MOVIE = 2  # movie
    ANIME = 4  # anime
    VARIETY = 3  # variety show

    def get_precise_positioning(self, other) -> List[str]:
        """Return the selector strings registered for the category ``other``.

        ``other`` is matched by equality against the enum members, so a plain
        int code works as well as a member. The value of the instance the
        method is called on is not consulted.

        Returns ``None`` (despite the annotation) when ``other`` matches no
        category.
        """
        # Ordered (category, selectors) pairs; checked in the listed order.
        selector_table = (
            (
                self.ANIME,
                [
                    "html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active",
                    "li.anime-platform:nth-child(1)",
                ],
            ),
            (
                self.MOVIE,
                [
                    "li.active:nth-child(3)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
            (
                self.EPISODES,
                [
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
            (
                self.VARIETY,
                [
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
        )
        for category, selectors in selector_table:
            if category == other:
                return selectors
        return None
class GuoDuoSpiderResult(BaseModel):
    """Spider result record: date, category, platform, title, score and creation time."""

    # Data date as a "yyyy-MM-dd" string.
    # The pattern is a raw string so that ``\d`` is not parsed as an (invalid)
    # string escape sequence; the regex value itself is unchanged.
    time: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Content category; codes follow TypeEnum:
    # 1 TV series, 2 movie, 3 variety show, 4 anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Name of the media resource.
    sourceName: str
    # Popularity ("heat") score.
    score: float
    # Creation time of the record.
    createTime: datetime

    class Config:
        from_attributes = False
class SpiderParams(BaseModel):
    """Parameters for a spider run: a date range, target categories and a URL."""

    # Range bounds as "yyyy-MM-dd" strings.
    # The patterns are raw strings so that ``\d`` is not parsed as an (invalid)
    # string escape sequence; the regex value itself is unchanged.
    startDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    endDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Categories to crawl; codes follow TypeEnum:
    # 1 TV series, 2 movie, 3 variety show, 4 anime.
    target_type: List[TypeEnum]
    # Address to crawl.
    url: str

    class Config:
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """Return every day from ``startDate`` to ``endDate`` inclusive.

        Returns:
            The days as "%Y-%m-%d" strings in ascending order. The list is
            empty when ``endDate`` is earlier than ``startDate``.

        Raises:
            ValueError: if either bound is not a real calendar date. The field
                pattern only checks the digit layout, so a value such as
                "2023-13-40" reaches ``strptime`` and is rejected there.
        """
        first_day = datetime.strptime(self.startDate, "%Y-%m-%d")
        last_day = datetime.strptime(self.endDate, "%Y-%m-%d")
        span_days = (last_day - first_day).days
        # +1 so that the end date itself is included.
        return [
            (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)
        ]
class ApschedulerJob(BaseModel):
    """Pairs an APScheduler ``Job`` object with a task id and task MD5 string."""

    # Despite the "Id" suffix, this field is typed as the apscheduler Job
    # object itself, not an identifier.
    jobId: Job
    taskId: int
    # NOTE(review): presumably an MD5 digest identifying the task — confirm
    # against the code that builds this model.
    taskMd5: str

    class Config:
        # Presumably required because Job is a plain third-party class rather
        # than a type pydantic can validate natively.
        arbitrary_types_allowed = True
class TaskInfo(BaseModel):
    """Task description: start flag, execute strategy, id, URL and MD5 string.

    All five fields are declared without defaults.
    """

    taskStart: bool
    # NOTE(review): format of the strategy string is not visible here
    # (cron expression? keyword?) — confirm with the scheduler code.
    executeStrategy: str
    taskId: int
    url: str
    taskMd5: str
class PlatformImage(BaseModel):
    """Model holding a single ``url`` string."""

    # NOTE(review): presumably the URL of a platform logo/image — confirm.
    url: str
class DataItem(BaseModel):
    """One entry of the ``data`` list carried by ``ResponseModel``.

    Nullable fields are spelled ``Optional[...]`` throughout. Only
    ``topHotCount`` and ``topHotCountDay`` declare a default (``None``); the
    remaining fields have none.
    """

    showId: Optional[int]
    name: str
    category: Optional[str]
    releaseDate: Optional[int]
    days: Optional[int]
    offlineDate: Optional[int]
    releaseStatus: Optional[int]
    episode: Optional[int]
    # gdi is carried as text, gdiFloat as a number.
    # NOTE(review): presumably the same metric in two forms — confirm.
    gdi: Optional[str]
    gdiFloat: float
    attachData: str
    rank: Optional[int]
    rise: Optional[int]
    platformImgs: Optional[List[str]]
    topHotCount: Optional[int] = None
    topHotCountDay: Optional[int] = None
    totalPlayCount: Optional[int]
class ResponseModel(BaseModel):
    """Response envelope: integer code, message, list of ``DataItem`` and optional extras."""

    code: int
    msg: str
    data: List[DataItem]
    # Free-form dictionary; defaults to None when not supplied.
    extra: Optional[dict] = None