# spider_schema.py 4.48 KB
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List, Optional

from apscheduler.job import Job
from pydantic import BaseModel, Field


class TypeEnum(IntEnum):
    """
    Media category codes: 1 TV series, 2 movie, 3 variety show, 4 anime.
    """
    EPISODES = 1  # TV series
    MOVIE = 2  # movie
    ANIME = 4  # anime
    VARIETY = 3  # variety show

    def get_precise_positioning(self, other) -> Optional[List[str]]:
        """Return the CSS selector strings registered for category ``other``.

        The result depends only on ``other``; the member the method is
        called on is not consulted.

        Args:
            other: A ``TypeEnum`` member, or any value that compares equal
                to one (for example the plain integer code).

        Returns:
            A newly built list of selector strings, or ``None`` when
            ``other`` matches no category.
        """
        # Members are referenced through the class rather than through
        # ``self``: reaching one member from another instance is discouraged
        # by the enum documentation and triggers a warning on Python 3.11.
        selectors_by_type = (
            (
                TypeEnum.ANIME,
                [
                    "html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active",
                    "li.anime-platform:nth-child(1)",
                ],
            ),
            (
                TypeEnum.MOVIE,
                [
                    "li.active:nth-child(3)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
            (
                TypeEnum.EPISODES,
                [
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
            (
                TypeEnum.VARIETY,
                [
                    "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)",
                    "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)",
                    "ul.type-box:nth-child(1) > li:nth-child(1)",
                ],
            ),
        )
        for member, selectors in selectors_by_type:
            if member == other:
                return selectors
        # Unknown category: made explicit instead of falling off the end.
        return None


class GuoDuoSpiderResult(BaseModel):
    """One scraped record: a heat score for a media title on a platform and date."""

    # Data date as a yyyy-MM-dd string. The pattern only checks the digit
    # layout, not that the value is a real calendar date.
    time: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Category code, see TypeEnum: 1 TV series, 2 movie, 3 variety show, 4 anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Media resource (title) name.
    sourceName: str
    # Heat score.
    score: float
    # Creation time of the record.
    createTime: datetime

    class Config:
        # Explicitly keeps pydantic's attribute-based construction switched off.
        from_attributes = False


class SpiderParams(BaseModel):
    """Parameters of one crawl: an inclusive date range, categories and a URL."""

    # First day of the range, yyyy-MM-dd. The pattern checks only the digit
    # layout, not calendar validity.
    startDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Last day of the range (inclusive), yyyy-MM-dd.
    endDate: str = Field(
        ..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r"^\d{4}-\d{2}-\d{2}$"
    )
    # Category codes to crawl, see TypeEnum.
    target_type: List[TypeEnum]
    # Address to crawl.
    url: str

    class Config:
        # Explicitly keeps pydantic's attribute-based construction switched off.
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """List every day from ``startDate`` to ``endDate`` inclusive.

        Returns:
            The days as yyyy-MM-dd strings in ascending order; an empty list
            when ``endDate`` is earlier than ``startDate``.

        Raises:
            ValueError: If either string has the right shape but is not a
                real calendar date (for example month 13).
        """
        first_day = datetime.strptime(self.startDate, "%Y-%m-%d")
        last_day = datetime.strptime(self.endDate, "%Y-%m-%d")
        span_days = (last_day - first_day).days
        # range() is empty for a negative span, so a reversed range yields [].
        return [
            (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)
        ]


class ApschedulerJob(BaseModel):
    """Pairs an APScheduler job with the id and MD5 of the task it belongs to."""

    # Holds the apscheduler ``Job`` object itself, despite the ``Id`` suffix.
    jobId: Job
    # Numeric task identifier.
    taskId: int
    # MD5 string associated with the task (what is hashed is not shown here).
    taskMd5: str

    class Config:
        # ``Job`` is not a pydantic-aware type, so arbitrary types are allowed.
        arbitrary_types_allowed = True


class TaskInfo(BaseModel):
    """Description of a task: start flag, execution strategy, id, URL and MD5."""

    # Boolean start flag; presumably whether the task should be started — TODO confirm.
    taskStart: bool
    # Execution strategy as a string; its format is not defined in this file.
    executeStrategy: str
    # Numeric task identifier.
    taskId: int
    # URL associated with the task.
    url: str
    # MD5 string associated with the task.
    taskMd5: str


class PlatformImage(BaseModel):
    """Wrapper model holding a single image URL."""

    # Image location.
    url: str


class DataItem(BaseModel):
    """One entry of the ``data`` list in ``ResponseModel``.

    Most fields are nullable. Only ``topHotCount`` and ``topHotCountDay``
    carry a default and may therefore be omitted.
    """

    showId: Optional[int]
    name: str
    category: Optional[str]
    # NOTE(review): releaseDate / offlineDate are presumably epoch timestamps
    # in milliseconds — confirm against the upstream API.
    releaseDate: Optional[int]
    days: Optional[int]
    offlineDate: Optional[int]
    releaseStatus: Optional[int]
    episode: Optional[int]
    # ``gdi`` is the string form and ``gdiFloat`` the numeric form of the same
    # metric, judging by the names — TODO confirm.
    gdi: Optional[str]
    gdiFloat: float
    attachData: str
    rank: Optional[int]
    rise: Optional[int]
    platformImgs: Optional[List[str]]
    topHotCount: Optional[int] = None
    topHotCountDay: Optional[int] = None
    totalPlayCount: Optional[int]


class ResponseModel(BaseModel):
    """Response envelope: status code, message, list of items, optional extras."""

    # Numeric status code.
    code: int
    # Message text accompanying the code.
    msg: str
    # Payload entries.
    data: List[DataItem]
    # Optional free-form extra data; defaults to None when absent.
    extra: Optional[dict] = None


if __name__ == "__main__":
    from datetime import datetime

    def timestamp_to_date_string(timestamp_ms):
        """Format a millisecond epoch timestamp as a 'yyyy-MM-dd' string.

        ``datetime.fromtimestamp`` is called without a timezone, so the
        resulting date is expressed in the machine's local time.
        """
        seconds = timestamp_ms / 1000  # fromtimestamp expects seconds
        return datetime.fromtimestamp(seconds).strftime('%Y-%m-%d')

    # Sample timestamps, in milliseconds.
    release_date_timestamp = 1573488000000
    offline_date_timestamp = 1576080000000

    # Convert each sample and print the result.
    print(f"Release Date: {timestamp_to_date_string(release_date_timestamp)}")
    print(f"Offline Date: {timestamp_to_date_string(offline_date_timestamp)}")