# spider_schema.py 3.69 KB
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List, Optional

from apscheduler.job import Job
from pydantic import BaseModel, Field


class TypeEnum(IntEnum):
    """Media category codes.

    1 = TV series, 2 = movie, 3 = variety show, 4 = anime.
    """

    EPISODES = 1  # TV series
    MOVIE = 2  # movie
    ANIME = 4  # anime
    VARIETY = 3  # variety show

    def get_precise_positioning(self, other) -> Optional[List[str]]:
        """Return the CSS selectors registered for the category ``other``.

        The lookup is keyed on ``other``; the member the method is invoked on
        does not influence the result.

        Args:
            other: Value compared with ``==`` against each member, so a
                ``TypeEnum`` member or its plain integer code both match.

        Returns:
            A new list of selector strings for the matching category, or
            ``None`` when ``other`` equals none of the members.
        """
        # Members are looked up on the class rather than through ``self``.
        # Kept as a sequence of pairs scanned with ``==`` (not a dict) so that
        # unhashable or non-enum inputs simply fail to match.
        selectors_by_type = (
            (
                TypeEnum.ANIME,
                (
                    "html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active",
                    "li.anime-platform:nth-child(1)",
                ),
            ),
            (
                TypeEnum.MOVIE,
                (
                    "li.active:nth-child(3)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ),
            ),
            (
                TypeEnum.EPISODES,
                (
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ),
            ),
            (
                TypeEnum.VARIETY,
                (
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ),
            ),
        )
        for member, selectors in selectors_by_type:
            if member == other:
                # Fresh list per call so callers may mutate the result freely.
                return list(selectors)
        # No category matched: make the historical implicit None explicit.
        return None


class GuoDuoSpiderResult(BaseModel):
    """One scraped record: data date, category, platform, title and heat score."""

    # Data date as a "yyyy-MM-dd" string. The pattern only checks the digit
    # layout; it does not reject impossible dates such as "2024-13-45".
    # Raw string: "\d" is not a valid escape in a normal string literal.
    time: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Category code, per TypeEnum: 1 TV series, 2 movie, 3 variety show, 4 anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Name of the media resource (the title).
    sourceName: str
    # Heat / popularity score.
    score: float
    # Creation time of the record.
    createTime: datetime

    class Config:
        from_attributes = False


class SpiderParams(BaseModel):
    """Parameters for a crawl: inclusive date range, categories and target URL."""

    # First and last day of the range, each a "yyyy-MM-dd" string. The pattern
    # validates the digit layout only, not that the date exists.
    # Raw strings: "\d" is not a valid escape in a normal string literal.
    startDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    endDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Category codes to crawl, per TypeEnum: 1 TV series, 2 movie,
    # 3 variety show, 4 anime.
    target_type: List[TypeEnum]
    # Address to crawl.
    url: str

    class Config:
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """List every day from ``startDate`` to ``endDate``, both inclusive.

        Returns:
            "YYYY-MM-DD" strings in ascending order. The list is empty when
            ``endDate`` is earlier than ``startDate``.

        Raises:
            ValueError: If either bound is not a real calendar date (the field
                pattern only guarantees the digit layout).
        """
        first_day = datetime.strptime(self.startDate, "%Y-%m-%d")
        last_day = datetime.strptime(self.endDate, "%Y-%m-%d")
        span_days = (last_day - first_day).days
        # +1 so the end date itself is included; a negative span yields [].
        return [
            (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)
        ]


class ApschedulerJob(BaseModel):
    """Associates an APScheduler ``Job`` with a task id and task MD5 string."""

    # Despite the field name, this holds the apscheduler Job object itself.
    jobId: Job
    # Integer identifier of the task.
    taskId: int
    # NOTE(review): presumably an MD5 digest identifying the task definition
    # -- confirm against the code that builds it.
    taskMd5: str

    class Config:
        # Job is not a pydantic-native type, so arbitrary types must be
        # allowed for the jobId field to be accepted.
        arbitrary_types_allowed = True


class TaskInfo(BaseModel):
    """Description of a task; all five fields are required."""

    # NOTE(review): presumably whether the task should be started -- confirm.
    taskStart: bool
    # NOTE(review): format of the strategy string is not visible here
    # (possibly a schedule/trigger expression) -- confirm with the caller.
    executeStrategy: str
    # Integer identifier of the task.
    taskId: int
    # URL associated with the task.
    url: str
    # NOTE(review): presumably an MD5 digest identifying the task -- confirm.
    taskMd5: str


class PlatformImage(BaseModel):
    """Wrapper model holding a single image URL string."""

    # Image address.
    url: str


class DataItem(BaseModel):
    """One entry of the ``data`` list carried by ``ResponseModel``.

    ``name``, ``gdiFloat`` and ``attachData`` are required and non-nullable.
    ``topHotCount`` and ``topHotCountDay`` default to ``None`` and may be
    omitted. The remaining fields accept ``None`` but declare no default.

    NOTE(review): under pydantic v2 a ``X | None`` field without a default
    must still be present in the input (its value may be ``None``); under
    pydantic v1 it would default to ``None``. Confirm which version is used.
    """

    showId: int | None
    name: str
    category: str | None
    # NOTE(review): releaseDate/offlineDate are integers; the unit or encoding
    # (e.g. timestamp vs. yyyymmdd) is not visible here -- confirm.
    releaseDate: int | None
    days: int | None
    offlineDate: int | None
    releaseStatus: int | None
    episode: int | None
    # gdi is carried as a string alongside a float field, gdiFloat.
    # NOTE(review): presumably the same metric in two forms -- confirm.
    gdi: str | None
    gdiFloat: float
    attachData: str
    rank: int | None
    rise: int | None
    # Plain list of strings (not PlatformImage objects).
    platformImgs: List[str] | None
    topHotCount: Optional[int] = None
    topHotCountDay: Optional[int] = None
    totalPlayCount: int | None


class ResponseModel(BaseModel):
    """Response envelope: status code, message, list of items, optional extras."""

    # Integer status code.
    code: int
    # Message string accompanying the code.
    msg: str
    # Payload: a list of DataItem entries (required; may be an empty list).
    data: List[DataItem]
    # Optional free-form dictionary; defaults to None when absent.
    extra: Optional[dict] = None