# spider_schema.py 2.43 KB
from datetime import timedelta, datetime
from enum import IntEnum
from typing import List

from apscheduler.job import Job
from pydantic import BaseModel, Field


class TypeEnum(IntEnum):
    """Media categories, stored as their integer codes."""

    EPISODES = 1  # TV series
    MOVIE = 2  # movies
    ANIME = 3  # anime

    def get_precise_positioning(self, other) -> List[str]:
        """Return the selector strings registered for the category ``other``.

        The lookup is driven entirely by ``other``; the member the method is
        called on does not influence the result.

        Args:
            other: A ``TypeEnum`` member or anything that compares equal to
                one (for example the plain integers 1, 2 or 3).

        Returns:
            A new list of CSS-style selector strings for the matching
            category (presumably used to locate that category on the scraped
            page -- confirm with callers), or an empty list when ``other``
            matches no category, so the result always honours the
            ``List[str]`` annotation instead of silently being ``None``.
        """
        # Resolve members through the class rather than through ``self``.
        members = type(self)
        selectors_by_type = (
            (members.ANIME, ['.cateIcon_5',
                             'li.anime-platform:nth-child(1)']),
            (members.MOVIE, ['.cateIcon_3',
                             'ul.type-box:nth-child(1) > li:nth-child(1)']),
            (members.EPISODES, ['.cateIcon_1',
                                'div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)',
                                'ul.type-box:nth-child(1) > li:nth-child(1)']),
        )
        # Compare with ``==`` (not a dict lookup) so unhashable or otherwise
        # unexpected values simply fall through to the empty result.
        for member, selectors in selectors_by_type:
            if member == other:
                return selectors
        return []


class GuoDuoSpiderResult(BaseModel):
    """One spider result record: a scored media resource on a platform."""

    # Data date as a yyyy-MM-dd formatted string.
    # The pattern is a raw string so ``\d`` reaches the regex engine as-is
    # instead of being parsed as an (invalid) string escape sequence.
    time: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # Category: 1 = TV series, 2 = movie, 3 = anime.
    targetType: TypeEnum = Field(..., description="数据类型")
    # Platform name.
    platform: str
    # Media resource name.
    sourceName: str
    # Popularity score.
    score: float
    # Creation time.
    createTime: datetime

    class Config:
        from_attributes = False


class SpiderParams(BaseModel):
    """Parameters for one spider run: date range, categories and target URL."""

    # Start and end dates as yyyy-MM-dd formatted strings.
    # The patterns are raw strings so ``\d`` reaches the regex engine as-is
    # instead of being parsed as an (invalid) string escape sequence.
    startDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    endDate: str = Field(..., description="数据时间 yyyy-MM-dd 格式的字符", pattern=r'^\d{4}-\d{2}-\d{2}$')
    # Categories to crawl: 1 = TV series, 2 = movie, 3 = anime.
    target_type: List[TypeEnum]
    # Address to crawl.
    url: str

    class Config:
        from_attributes = False

    def get_time_range(self) -> List[str]:
        """Return every date from ``startDate`` to ``endDate``, inclusive.

        Returns:
            The dates as ``yyyy-MM-dd`` strings in ascending order. The list
            is empty when ``endDate`` is earlier than ``startDate``.

        Raises:
            ValueError: If either string is not a real calendar date. The
                field pattern only checks the digit layout, so a value such
                as ``2024-13-40`` reaches ``strptime`` and is rejected there.
        """
        first_day = datetime.strptime(self.startDate, '%Y-%m-%d')
        last_day = datetime.strptime(self.endDate, '%Y-%m-%d')
        # +1 makes the end date itself part of the range.
        day_count = (last_day - first_day).days + 1
        return [
            (first_day + timedelta(days=offset)).strftime('%Y-%m-%d')
            for offset in range(day_count)
        ]


class ApschedulerJob(BaseModel):
    """Pairs an APScheduler ``Job`` object with an integer task id."""

    class Config:
        # ``Job`` is an ordinary APScheduler class that pydantic has no
        # built-in validation for, so arbitrary types must be allowed
        # for the ``jobId`` field to be accepted.
        arbitrary_types_allowed = True

    # Despite the name, this holds the ``Job`` instance itself, not an id.
    jobId: Job
    # Integer task id (presumably the task this job runs for -- confirm).
    taskId: int


class TaskInfo(BaseModel):
    """Describes a task: its start flag, execution strategy, id and URL."""

    # Boolean start flag (presumably whether the task should be / is
    # started -- confirm against callers).
    taskStart: bool
    # Execution strategy as a string; the accepted format is not visible
    # in this file (NOTE(review): possibly a schedule expression -- confirm).
    executeStrategy: str
    # Integer task id.
    taskId: int
    # URL associated with the task (presumably the address to crawl -- confirm).
    url: str