Fetch data from the API
Showing 13 changed files with 275 additions and 51 deletions
| ... | @@ -3,5 +3,5 @@ | ... | @@ -3,5 +3,5 @@ |
| 3 | <component name="Black"> | 3 | <component name="Black"> |
| 4 | <option name="sdkName" value="Spider" /> | 4 | <option name="sdkName" value="Spider" /> |
| 5 | </component> | 5 | </component> |
| 6 | <component name="ProjectRootManager" version="2" project-jdk-name="spider" project-jdk-type="Python SDK" /> | 6 | <component name="ProjectRootManager" version="2" project-jdk-name="Spider" project-jdk-type="Python SDK" /> |
| 7 | </project> | 7 | </project> |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -5,16 +5,15 @@ tortoise: | ... | @@ -5,16 +5,15 @@ tortoise: |
| 5 | default: | 5 | default: |
| 6 | engine: tortoise.backends.mysql | 6 | engine: tortoise.backends.mysql |
| 7 | credentials: | 7 | credentials: |
| 8 | database: ams_test | 8 | database: spider |
| 9 | host: 172.0.31.57 | 9 | host: 192.168.1.91 |
| 10 | password: Tjlh@2017 | 10 | password: Tjlh@2023 |
| 11 | port: 3306 | 11 | port: 3306 |
| 12 | user: root | 12 | user: root |
| 13 | minsize: 10 | 13 | minsize: 10 |
| 14 | maxsize: 200 | 14 | maxsize: 200 |
| 15 | connect_timeout: 30 | 15 | connect_timeout: 30 |
| 16 | echo: True | 16 | echo: True |
| 17 | timezone: Asia/Shanghai | ||
| 18 | apps: | 17 | apps: |
| 19 | models_read: | 18 | models_read: |
| 20 | models: | 19 | models: |
| ... | @@ -24,4 +23,14 @@ tortoise: | ... | @@ -24,4 +23,14 @@ tortoise: |
| 24 | 23 | ||
| 25 | getSpider: | 24 | getSpider: |
| 26 | # 设置设置爬虫定时任务时间间隔 单位是分钟 | 25 | # 设置设置爬虫定时任务时间间隔 单位是分钟 |
| 27 | interval: 5 | 26 | interval: 1 |
| 27 | urlTemplate: | ||
| 28 | # 1 电视剧 2 电影 3 综艺 4 动漫 | ||
| 29 | - type: 1 | ||
| 30 | template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0" | ||
| 31 | - type: 2 | ||
| 32 | template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0" | ||
| 33 | - type: 3 | ||
| 34 | template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_VARIETY&date={date}&attach=gdi&orderTitle=gdi&platformId=0" | ||
| 35 | - type: 4 | ||
| 36 | template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0" | ... | ... |
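Each new `urlTemplate` entry is a relative path carrying a `{date}` placeholder; at crawl time the spider prefixes the site's base URL and substitutes a concrete day. A minimal sketch of that expansion (the base URL and the date below are illustrative values taken from elsewhere in this change set, not part of the config file itself):

```python
# Expanding one configured template into a full request URL.
base_url = "http://guduodata.com"  # example value; the real base comes from the task's target URL
template = ("/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE"
            "&date={date}&attach=gdi&orderTitle=gdi&platformId=0")

url = base_url + template.format(date="2024-12-22")
print(url)
# -> http://guduodata.com/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date=2024-12-22&attach=gdi&orderTitle=gdi&platformId=0
```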
| ... | @@ -8,8 +8,9 @@ import logging | ... | @@ -8,8 +8,9 @@ import logging |
| 8 | 8 | ||
| 9 | logger = logging.getLogger(__name__) | 9 | logger = logging.getLogger(__name__) |
| 10 | 10 | ||
| 11 | |||
| 11 | @functools.lru_cache() | 12 | @functools.lru_cache() |
| 12 | def getAppConfig()->AppConfig: | 13 | def getAppConfig() -> AppConfig: |
| 13 | logger.info('开始加载AppConfig') | 14 | logger.info('开始加载AppConfig') |
| 14 | # 获取当前文件的绝对路径 | 15 | # 获取当前文件的绝对路径 |
| 15 | current_file_path = os.path.abspath(__file__) | 16 | current_file_path = os.path.abspath(__file__) | ... | ... |
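The hunk above only shows the first lines of the cached loader. A minimal sketch of what such an `lru_cache`-backed config reader typically looks like, assuming the YAML file sits next to the module and is parsed with PyYAML (the file name and parsing details are assumptions; the diff truncates before them):

```python
import functools
import logging
import os

import yaml  # assumption: PyYAML parses the config file

from app.schemas.config_schema import AppConfig

logger = logging.getLogger(__name__)


@functools.lru_cache()
def getAppConfig() -> AppConfig:
    """Load the application config once; later callers get the memoized instance."""
    logger.info('开始加载AppConfig')
    # Resolve the config file relative to this module; "config.yaml" is a placeholder name.
    current_file_path = os.path.abspath(__file__)
    config_path = os.path.join(os.path.dirname(current_file_path), "config.yaml")
    with open(config_path, encoding="utf-8") as f:
        return AppConfig(**yaml.safe_load(f))
```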
| 1 | from datetime import datetime, timedelta | ||
| 2 | import hashlib | 1 | import hashlib |
| 3 | import logging | 2 | import logging |
| 3 | from datetime import datetime, timedelta | ||
| 4 | from typing import Dict, List, Tuple | 4 | from typing import Dict, List, Tuple |
| 5 | from app.model.mysql_model import XWebCrawler | 5 | |
| 6 | from apscheduler.schedulers.asyncio import AsyncIOScheduler | 6 | from apscheduler.schedulers.asyncio import AsyncIOScheduler |
| 7 | from superstream import Stream | 7 | from superstream import Stream |
| 8 | |||
| 8 | from app.config.app_config import getAppConfig | 9 | from app.config.app_config import getAppConfig |
| 10 | from app.model.mysql_model import XWebCrawler | ||
| 9 | from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum | 11 | from app.schemas.spider_schema import ApschedulerJob, SpiderParams, TaskInfo, TypeEnum |
| 10 | 12 | ||
| 11 | logger = logging.getLogger(__name__) | 13 | logger = logging.getLogger(__name__) |
| ... | @@ -19,7 +21,6 @@ scheduler = AsyncIOScheduler() | ... | @@ -19,7 +21,6 @@ scheduler = AsyncIOScheduler() |
| 19 | @scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval")) | 21 | @scheduler.scheduled_job("interval", minutes=getAppConfig().getSpider.get("interval")) |
| 20 | async def get_spider_config(): | 22 | async def get_spider_config(): |
| 21 | from app.service.spider_job_service import ( | 23 | from app.service.spider_job_service import ( |
| 22 | add_job, | ||
| 23 | add_guduo_job, | 24 | add_guduo_job, |
| 24 | delete_job, | 25 | delete_job, |
| 25 | get_job_info, | 26 | get_job_info, |
| ... | @@ -40,9 +41,9 @@ async def get_spider_config(): | ... | @@ -40,9 +41,9 @@ async def get_spider_config(): |
| 40 | # 获取已经注册的任务 | 41 | # 获取已经注册的任务 |
| 41 | jobs: List[ApschedulerJob] = get_job_info() | 42 | jobs: List[ApschedulerJob] = get_job_info() |
| 42 | # 处理已经关闭的任务 | 43 | # 处理已经关闭的任务 |
| 43 | regist_jobs: Dict[int, ApschedulerJob] = Stream(jobs).group_by(lambda x: x.taskId) | 44 | registered_jobs: Dict[int, ApschedulerJob] = Stream(jobs).to_dict(lambda x: x.taskId, lambda y: y) |
| 44 | for job in status_group.get(0, []): | 45 | for job in status_group.get(0, []): |
| 45 | if job.id in regist_jobs: | 46 | if job.id in registered_jobs: |
| 46 | delete_job(job.id) | 47 | delete_job(job.id) |
| 47 | logger.info(f"删除任务成功,任务id:{job.id}") | 48 | logger.info(f"删除任务成功,任务id:{job.id}") |
| 48 | # 处理新的定时任务 | 49 | # 处理新的定时任务 |
| ... | @@ -66,7 +67,7 @@ async def get_spider_config(): | ... | @@ -66,7 +67,7 @@ async def get_spider_config(): |
| 66 | url=job.target_url, | 67 | url=job.target_url, |
| 67 | taskMd5=md5, | 68 | taskMd5=md5, |
| 68 | ) | 69 | ) |
| 69 | if job.id not in regist_jobs: | 70 | if job.id not in registered_jobs: |
| 70 | add_guduo_job(param, taskInfo) | 71 | add_guduo_job(param, taskInfo) |
| 71 | logger.info(f"添加任务成功,任务id:{job.id}") | 72 | logger.info(f"添加任务成功,任务id:{job.id}") |
| 72 | else: | 73 | else: | ... | ... |
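The key fix in this hunk is replacing `group_by` (whose values would be lists) with a dict keyed by `taskId`, so `job.id in registered_jobs` becomes a plain key lookup. For readers who don't know superstream, a plain-Python equivalent of what the new `Stream(jobs).to_dict(...)` line builds:

```python
from typing import Dict, List

from app.schemas.spider_schema import ApschedulerJob


def index_jobs_by_task_id(jobs: List[ApschedulerJob]) -> Dict[int, ApschedulerJob]:
    # One registered job per taskId; membership tests and lookups become O(1).
    return {job.taskId: job for job in jobs}


# Usage mirroring the surrounding loop:
#   registered_jobs = index_jobs_by_task_id(get_job_info())
#   if job.id in registered_jobs:
#       delete_job(job.id)
```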
| ... | @@ -15,7 +15,7 @@ handlers: | ... | @@ -15,7 +15,7 @@ handlers: |
| 15 | class: logging.handlers.TimedRotatingFileHandler | 15 | class: logging.handlers.TimedRotatingFileHandler |
| 16 | level: INFO | 16 | level: INFO |
| 17 | formatter: standard | 17 | formatter: standard |
| 18 | filename: 'logs/app.log' # 日志文件的路径和名称 | 18 | filename: '/Users/mac/PycharmProjects/pythonProject/FastAPIProject1/app/logs/app.log' # 日志文件的路径和名称 |
| 19 | when: midnight # 按天分割 | 19 | when: midnight # 按天分割 |
| 20 | interval: 1 # 每天分割一次 | 20 | interval: 1 # 每天分割一次 |
| 21 | backupCount: 7 # 保留最近7天的日志文件 | 21 | backupCount: 7 # 保留最近7天的日志文件 | ... | ... |
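This handler block belongs to a standard `logging.config.dictConfig` YAML. A minimal sketch of how such a file is loaded at startup; the file name `logging.yaml` is a placeholder, since only the handler section is visible in the diff:

```python
import logging
import logging.config

import yaml  # assumption: the logging config is YAML parsed with PyYAML

with open("logging.yaml", encoding="utf-8") as f:  # placeholder path
    logging.config.dictConfig(yaml.safe_load(f))

# After this, the TimedRotatingFileHandler rotates app.log at midnight and keeps 7 backups.
logging.getLogger(__name__).info("logging configured")
```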
This diff could not be displayed because it is too large.
| ... | @@ -46,6 +46,12 @@ class Port(BaseModel): | ... | @@ -46,6 +46,12 @@ class Port(BaseModel): |
| 46 | port: int | 46 | port: int |
| 47 | 47 | ||
| 48 | 48 | ||
| 49 | class UrlTemplateInfo(BaseModel): | ||
| 50 | type: int | ||
| 51 | template: str | ||
| 52 | |||
| 53 | |||
| 49 | class AppConfig(BaseModel): | 54 | class AppConfig(BaseModel): |
| 50 | server: Port | 55 | server: Port |
| 51 | getSpider: Dict[str, int] | 56 | getSpider: Dict[str, int] |
| 57 | urlTemplate: List[UrlTemplateInfo] | ... | ... |
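The new `urlTemplate` block of the YAML maps onto `List[UrlTemplateInfo]`, and pydantic coerces each nested mapping into the model. A small validation sketch (the `server` port is an illustrative value, not taken from the diff):

```python
from app.schemas.config_schema import AppConfig

cfg = AppConfig(
    server={"port": 8000},        # illustrative; the real port lives in the config file
    getSpider={"interval": 1},
    urlTemplate=[
        {"type": 2,
         "template": "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE"
                     "&date={date}&attach=gdi&orderTitle=gdi&platformId=0"},
    ],
)
print(type(cfg.urlTemplate[0]))  # <class 'app.schemas.config_schema.UrlTemplateInfo'>
```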
app/schemas/safe_contrainer.py (new file, 0 → 100644)
| 1 | import threading | ||
| 2 | from typing import TypeVar, List, Dict, Generic | ||
| 3 | |||
| 4 | K = TypeVar('K') # 定义类型变量 | ||
| 5 | |||
| 6 | V = TypeVar('V') | ||
| 7 | |||
| 8 | |||
| 9 | class SafeDict(Generic[K, V]): | ||
| 10 | |||
| 11 | def __init__(self): | ||
| 12 | self.lock = threading.Lock() | ||
| 13 | self.data: Dict[K, V] = {} | ||
| 14 | |||
| 15 | def put(self, key: K, value: V): | ||
| 16 | with self.lock: | ||
| 17 | self.data.update({key: value}) | ||
| 18 | |||
| 19 | def remove(self, key: K): | ||
| 20 | with self.lock: | ||
| 21 | self.data.pop(key) | ||
| 22 | |||
| 23 | def values(self) -> List[V]: | ||
| 24 | with self.lock: | ||
| 25 | return list(self.data.values()) | ||
| 26 | |||
| 27 | def get(self, key: K) -> V: | ||
| 28 | with self.lock: | ||
| 29 | return self.data.get(key) | ||
| 30 | |||
| 31 | |||
| 32 | |||
| 33 | |||
| 34 | if __name__ == '__main__': | ||
| 35 | test: SafeDict[int, str] = SafeDict() | ||
| 36 | test.put(1, '1') | ||
| 37 | print(test.get(1)) | ||
| 38 | print(test.get(2)) | ||
| 39 | t2 = {} | ||
| 40 | print(t2.get(1)) |
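`SafeDict` wraps a plain dict with a `threading.Lock` so the scheduler's job registry can be touched from multiple threads. A minimal sketch of the kind of concurrent access it is meant to survive (thread count and values are illustrative):

```python
import threading

from app.schemas.safe_contrainer import SafeDict

registry: SafeDict[int, str] = SafeDict()


def worker(task_id: int) -> None:
    # Each thread mutates the shared dict; the internal lock serializes the updates.
    registry.put(task_id, f"job-{task_id}")


threads = [threading.Thread(target=worker, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print(sorted(registry.values()))  # all eight entries survive the concurrent puts
```

Note that `remove` delegates to `dict.pop` without a default, so removing a key that was never registered raises `KeyError`, while `get` simply returns `None` for missing keys.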
| 1 | from datetime import timedelta, datetime | 1 | from datetime import timedelta, datetime |
| 2 | from enum import IntEnum | 2 | from enum import IntEnum |
| 3 | from typing import List | 3 | from typing import List, Optional |
| 4 | 4 | ||
| 5 | from apscheduler.job import Job | 5 | from apscheduler.job import Job |
| 6 | from pydantic import BaseModel, Field | 6 | from pydantic import BaseModel, Field |
| ... | @@ -17,19 +17,22 @@ class TypeEnum(IntEnum): | ... | @@ -17,19 +17,22 @@ class TypeEnum(IntEnum): |
| 17 | 17 | ||
| 18 | def get_precise_positioning(self, other) -> List[str]: | 18 | def get_precise_positioning(self, other) -> List[str]: |
| 19 | if self.ANIME == other: | 19 | if self.ANIME == other: |
| 20 | return [".cateIcon_5", "li.anime-platform:nth-child(1)"] | 20 | return ["html body div#app.wrap div.content.index div.content-box div.category.category-nav ul li.active", |
| 21 | "li.anime-platform:nth-child(1)"] | ||
| 21 | if self.MOVIE == other: | 22 | if self.MOVIE == other: |
| 22 | return [".cateIcon_3", "ul.type-box:nth-child(1) > li:nth-child(1)"] | 23 | return ["li.active:nth-child(3)", |
| 24 | "ul.type-box:nth-child(1) > li:nth-child(1)"] | ||
| 23 | if self.EPISODES == other: | 25 | if self.EPISODES == other: |
| 24 | return [ | 26 | return [ |
| 25 | ".cateIcon_1", | 27 | "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(1)", |
| 26 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", | 28 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", |
| 27 | "ul.type-box:nth-child(1) > li:nth-child(1)", | 29 | "ul.type-box:nth-child(1) > li:nth-child(1)", |
| 28 | ] | 30 | ] |
| 29 | if self.VARIETY == other: | 31 | if self.VARIETY == other: |
| 30 | return [ | 32 | return [ |
| 33 | "div.category:nth-child(3) > ul:nth-child(1) > li:nth-child(2)", | ||
| 31 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", | 34 | "div.category:nth-child(1) > ul:nth-child(1) > li:nth-child(1)", |
| 32 | "ul.type-box:nth-child(1) > li:nth-child(1)", | 35 | "ul.type-box:nth-child(1) > li:nth-child(1)" |
| 33 | ] | 36 | ] |
| 34 | 37 | ||
| 35 | 38 | ||
| ... | @@ -96,3 +99,57 @@ class TaskInfo(BaseModel): | ... | @@ -96,3 +99,57 @@ class TaskInfo(BaseModel): |
| 96 | taskId: int | 99 | taskId: int |
| 97 | url: str | 100 | url: str |
| 98 | taskMd5: str | 101 | taskMd5: str |
| 102 | |||
| 103 | |||
| 104 | class PlatformImage(BaseModel): | ||
| 105 | url: str | ||
| 106 | |||
| 107 | |||
| 108 | class DataItem(BaseModel): | ||
| 109 | showId: int | None | ||
| 110 | name: str | ||
| 111 | category: str | None | ||
| 112 | releaseDate: int | None | ||
| 113 | days: int | None | ||
| 114 | offlineDate: int | None | ||
| 115 | releaseStatus: int | None | ||
| 116 | episode: int | None | ||
| 117 | gdi: str | None | ||
| 118 | gdiFloat: float | ||
| 119 | attachData: str | ||
| 120 | rank: int | None | ||
| 121 | rise: int | None | ||
| 122 | platformImgs: List[str] | None | ||
| 123 | topHotCount: Optional[int] = None | ||
| 124 | topHotCountDay: Optional[int] = None | ||
| 125 | totalPlayCount: int | None | ||
| 126 | |||
| 127 | |||
| 128 | class ResponseModel(BaseModel): | ||
| 129 | code: int | ||
| 130 | msg: str | ||
| 131 | data: List[DataItem] | ||
| 132 | extra: Optional[dict] = None | ||
| 133 | |||
| 134 | |||
| 135 | if __name__ == "__main__": | ||
| 136 | from datetime import datetime | ||
| 137 | |||
| 138 | # 时间戳(以毫秒为单位) | ||
| 139 | release_date_timestamp = 1573488000000 | ||
| 140 | offline_date_timestamp = 1576080000000 | ||
| 141 | |||
| 142 | |||
| 143 | # 将时间戳转换为datetime对象,并转换为'yyyy-MM-dd'格式 | ||
| 144 | def timestamp_to_date_string(timestamp_ms): | ||
| 145 | # Convert from milliseconds to seconds by dividing by 1000, then use fromtimestamp | ||
| 146 | dt_object = datetime.fromtimestamp(timestamp_ms / 1000) | ||
| 147 | return dt_object.strftime('%Y-%m-%d') | ||
| 148 | |||
| 149 | |||
| 150 | # 调用函数并打印结果 | ||
| 151 | formatted_release_date = timestamp_to_date_string(release_date_timestamp) | ||
| 152 | formatted_offline_date = timestamp_to_date_string(offline_date_timestamp) | ||
| 153 | |||
| 154 | print(f"Release Date: {formatted_release_date}") | ||
| 155 | print(f"Offline Date: {formatted_offline_date}") | ... | ... |
| 1 | import asyncio | 1 | import asyncio |
| 2 | from datetime import datetime | ||
| 3 | import logging | 2 | import logging |
| 4 | from typing import List, Any | 3 | from datetime import datetime |
| 5 | from app.model.mysql_model import SpiderModel | 4 | from multiprocessing import Manager |
| 6 | from superstream import Stream | 5 | from typing import List, Any, Dict |
| 6 | from app.schemas.safe_contrainer import SafeDict | ||
| 7 | from app.job.job import scheduler | 7 | from app.job.job import scheduler |
| 8 | from app.model.mysql_model import SpiderModel | ||
| 8 | from app.schemas.spider_schema import ( | 9 | from app.schemas.spider_schema import ( |
| 9 | ApschedulerJob, | 10 | ApschedulerJob, |
| 10 | TaskInfo, | 11 | TaskInfo, |
| 11 | SpiderParams, | 12 | SpiderParams, |
| 12 | GuoDuoSpiderResult, | 13 | GuoDuoSpiderResult, |
| 13 | ) | 14 | ) |
| 14 | from app.spider.guduo_spider import startBrowser | 15 | from app.spider.http_spider import get_score_data |
| 15 | 16 | ||
| 16 | logger = logging.getLogger(__name__) | 17 | logger = logging.getLogger(__name__) |
| 17 | jobs: List[ApschedulerJob] = [] | 18 | jobs: SafeDict[int, ApschedulerJob] = SafeDict() |
| 18 | 19 | ||
| 19 | 20 | ||
| 20 | def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): | 21 | def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): |
| ... | @@ -40,7 +41,9 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): | ... | @@ -40,7 +41,9 @@ def add_job(params: List[Any], taskInfo: TaskInfo, task: Any): |
| 40 | week=weekend, | 41 | week=weekend, |
| 41 | args=params, | 42 | args=params, |
| 42 | ) | 43 | ) |
| 43 | jobs.append(ApschedulerJob(jobId=job, taskId=taskInfo.taskId,taskMd5=taskInfo.md5())) | 44 | jobs.put(taskInfo.taskId, |
| 45 | ApschedulerJob(jobId=job, taskId=taskInfo.taskId, taskMd5=taskInfo.taskMd5) | ||
| 46 | ) | ||
| 44 | logger.info( | 47 | logger.info( |
| 45 | f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}" | 48 | f"添加任务成功,任务id:{taskInfo.taskId},任务执行策略:{taskInfo.executeStrategy}" |
| 46 | ) | 49 | ) |
| ... | @@ -57,26 +60,21 @@ def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): | ... | @@ -57,26 +60,21 @@ def add_guduo_job(taskParam: SpiderParams, taskInfo: TaskInfo): |
| 57 | 60 | ||
| 58 | 61 | ||
| 59 | def delete_job(taskId: int): | 62 | def delete_job(taskId: int): |
| 60 | for job in jobs: | 63 | jobs.remove(taskId) |
| 61 | if job.taskId == taskId: | ||
| 62 | job.jobId.remove() | ||
| 63 | jobs.remove(job) | ||
| 64 | logger.info(f"删除任务成功,任务id:{taskId}") | 64 | logger.info(f"删除任务成功,任务id:{taskId}") |
| 65 | 65 | ||
| 66 | 66 | ||
| 67 | def get_job_info(taskId: int) -> List[ApschedulerJob]: | 67 | def get_job_info(taskId: int = None) -> List[ApschedulerJob]: |
| 68 | if taskId is None: | 68 | if taskId is None: |
| 69 | return jobs | 69 | return jobs.values() |
| 70 | job = Stream(jobs).filter(lambda x: x.taskId == taskId).to_list() | 70 | return [jobs.get(taskId)] if jobs.get(taskId) else [] |
| 71 | return f"job 信息->{job}" | ||
| 72 | 71 | ||
| 73 | 72 | ||
| 74 | async def scrawl_and_save(taskParam: SpiderParams): | 73 | async def scrawl_and_save(taskParam: SpiderParams): |
| 75 | try: | 74 | try: |
| 76 | # 执行爬虫获取结果 给下面一行代码添加 try cache try 捕获异常 | 75 | # 执行爬虫获取结果 给下面一行代码添加 try cache try 捕获异常 |
| 77 | results = await startBrowser(taskParam) | 76 | results = await get_score_data(taskParam) |
| 78 | except Exception as e: | 77 | except Exception as e: |
| 79 | logger.info(f"爬虫重试情况:{startBrowser.statistics}") | ||
| 80 | logger.error(f"爬虫任务执行失败,失败原因:{e}") | 78 | logger.error(f"爬虫任务执行失败,失败原因:{e}") |
| 81 | return | 79 | return |
| 82 | asyncTasks = (save_or_update(item) for item in results) | 80 | asyncTasks = (save_or_update(item) for item in results) | ... | ... |
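The diff is cut off right after the generator of `save_or_update` coroutines is built, so the actual tail of `scrawl_and_save` is not visible here. As a generic illustration only (not the file's real code), such a generator is usually drained with `asyncio.gather`; the stub coroutine below is hypothetical:

```python
import asyncio


async def save_or_update_stub(item: int) -> int:
    # Hypothetical stand-in for save_or_update(item), which persists one spider result.
    await asyncio.sleep(0)
    return item


async def drain(results: list) -> None:
    async_tasks = (save_or_update_stub(item) for item in results)
    # Fan the pending saves out concurrently and wait for all of them to finish.
    await asyncio.gather(*async_tasks)


asyncio.run(drain([1, 2, 3]))
```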
| ... | @@ -18,18 +18,12 @@ from tenacity import ( | ... | @@ -18,18 +18,12 @@ from tenacity import ( |
| 18 | logger = logging.getLogger(__name__) | 18 | logger = logging.getLogger(__name__) |
| 19 | 19 | ||
| 20 | 20 | ||
| 21 | @retry( | ||
| 22 | stop=stop_after_attempt(3), | ||
| 23 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
| 24 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
| 25 | after=after_log(logger, logging.DEBUG), | ||
| 26 | ) | ||
| 27 | async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | 21 | async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
| 28 | time_range = spiderParam.get_time_range() | 22 | time_range = spiderParam.get_time_range() |
| 29 | url = spiderParam.url | 23 | url = spiderParam.url |
| 30 | scrawl_types = spiderParam.target_type | 24 | scrawl_types = spiderParam.target_type |
| 31 | async with async_playwright() as p: | 25 | async with async_playwright() as p: |
| 32 | browser = await p.firefox.launch(headless=True) | 26 | browser = await p.firefox.launch(headless=False) |
| 33 | # 创建指定个数的浏览器页面 | 27 | # 创建指定个数的浏览器页面 |
| 34 | pages = await tqdm_asyncio.gather( | 28 | pages = await tqdm_asyncio.gather( |
| 35 | *(browser.new_page() for _ in range(len(time_range))) | 29 | *(browser.new_page() for _ in range(len(time_range))) |
| ... | @@ -51,14 +45,14 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | ... | @@ -51,14 +45,14 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: |
| 51 | 45 | ||
| 52 | 46 | ||
| 53 | async def hand_one_data( | 47 | async def hand_one_data( |
| 54 | targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str | 48 | targetDate: str, page: Page, scrawl_types: List[TypeEnum], url: str |
| 55 | ) -> List[GuoDuoSpiderResult]: | 49 | ) -> List[GuoDuoSpiderResult]: |
| 56 | # 把date这个日期对象解析为年、月、日 | 50 | # 把date这个日期对象解析为年、月、日 |
| 57 | year, month, day = targetDate.split("-") | 51 | year, month, day = targetDate.split("-") |
| 58 | day = day.replace("0", "") | 52 | day = day.replace("0", "") |
| 59 | month = int(month) - 1 | 53 | month = int(month) - 1 |
| 60 | await page.goto(url) | 54 | await page.goto(url) |
| 61 | # 等待遮罩层消失 | 55 | # 点击 |
| 62 | await page.click('//*[@id="currentDateTxt"]') | 56 | await page.click('//*[@id="currentDateTxt"]') |
| 63 | data_css_selector = ( | 57 | data_css_selector = ( |
| 64 | f'#day span.pika-button.pika-day[type="button"]' | 58 | f'#day span.pika-button.pika-day[type="button"]' |
| ... | @@ -70,6 +64,10 @@ async def hand_one_data( | ... | @@ -70,6 +64,10 @@ async def hand_one_data( |
| 70 | # 判断指定元素是否存在如果不存在就返回空的[] | 64 | # 判断指定元素是否存在如果不存在就返回空的[] |
| 71 | if not await doc.is_visible(): | 65 | if not await doc.is_visible(): |
| 72 | return [] | 66 | return [] |
| 67 | # 检查日期是否可点击 | ||
| 68 | if await doc.get_attribute('class') == 'pika-button pika-day is-disabled': | ||
| 69 | logger.warning(f"日期 {targetDate} 被禁用,无法点击") | ||
| 70 | return [] | ||
| 73 | # 点击指定日期 | 71 | # 点击指定日期 |
| 74 | await doc.click() | 72 | await doc.click() |
| 75 | # 最后一步修正定位 | 73 | # 最后一步修正定位 |
| ... | @@ -90,7 +88,7 @@ async def hand_one_data( | ... | @@ -90,7 +88,7 @@ async def hand_one_data( |
| 90 | 88 | ||
| 91 | 89 | ||
| 92 | async def get_data( | 90 | async def get_data( |
| 93 | targetPage: Page, scrawl_type: int, target_time: str | 91 | targetPage: Page, scrawl_type: int, target_time: str |
| 94 | ) -> Optional[List[GuoDuoSpiderResult]]: | 92 | ) -> Optional[List[GuoDuoSpiderResult]]: |
| 95 | target_data_css_selector = ".rank-box" | 93 | target_data_css_selector = ".rank-box" |
| 96 | table = targetPage.locator(target_data_css_selector) | 94 | table = targetPage.locator(target_data_css_selector) |
| ... | @@ -122,12 +120,22 @@ async def get_data( | ... | @@ -122,12 +120,22 @@ async def get_data( |
| 122 | return result | 120 | return result |
| 123 | 121 | ||
| 124 | 122 | ||
| 123 | @retry( | ||
| 124 | stop=stop_after_attempt(3), | ||
| 125 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
| 126 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
| 127 | after=after_log(logger, logging.DEBUG), | ||
| 128 | ) | ||
| 129 | async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | ||
| 130 | return await startBrowser(spiderParam) | ||
| 131 | |||
| 132 | |||
| 125 | if __name__ == "__main__": | 133 | if __name__ == "__main__": |
| 126 | # 获取程序开始执行的时间 | 134 | # 获取程序开始执行的时间 |
| 127 | start_time = time.time() | 135 | start_time = time.time() |
| 128 | param = SpiderParams( | 136 | param = SpiderParams( |
| 129 | startDate="2024-01-02", | 137 | startDate="2024-12-22", |
| 130 | endDate="2024-01-15", | 138 | endDate="2024-12-23", |
| 131 | target_type=[ | 139 | target_type=[ |
| 132 | TypeEnum.ANIME, | 140 | TypeEnum.ANIME, |
| 133 | TypeEnum.EPISODES, | 141 | TypeEnum.EPISODES, |
| ... | @@ -136,6 +144,6 @@ if __name__ == "__main__": | ... | @@ -136,6 +144,6 @@ if __name__ == "__main__": |
| 136 | ], | 144 | ], |
| 137 | url="http://guduodata.com/", | 145 | url="http://guduodata.com/", |
| 138 | ) | 146 | ) |
| 139 | res = asyncio.run(startBrowser(param)) | 147 | res = asyncio.run(retry_job(param)) |
| 140 | print(f"代码重试情况:{startBrowser.statistics}") | 148 | print(f"代码重试情况:{retry_job.statistics}") |
| 141 | print(f"程序执行耗时时间:{(time.time() - start_time)/1000}长度为:{len(res)}") | 149 | print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}") | ... | ... |
app/spider/http_spider.py (new file, 0 → 100644)
| 1 | import asyncio | ||
| 2 | import datetime | ||
| 3 | import time | ||
| 4 | from typing import List, Dict, Tuple | ||
| 5 | import aiohttp | ||
| 6 | from tenacity import retry, stop_after_attempt, before_sleep_log, wait_exponential, after_log | ||
| 7 | import logging | ||
| 8 | from app.schemas.config_schema import UrlTemplateInfo | ||
| 9 | from app.schemas.spider_schema import SpiderParams, TypeEnum, GuoDuoSpiderResult, ResponseModel | ||
| 10 | from app.config.app_config import getAppConfig | ||
| 11 | from superstream import Stream | ||
| 12 | |||
| 13 | logger = logging.getLogger(__name__) | ||
| 14 | |||
| 15 | |||
| 16 | def build_request_url(spiderParam: SpiderParams) -> Dict[TypeEnum, List[Tuple[str, str]]]: | ||
| 17 | base_url = spiderParam.url | ||
| 18 | targets = spiderParam.target_type | ||
| 19 | templates: List[UrlTemplateInfo] = getAppConfig().urlTemplate | ||
| 20 | templates_group: Dict[int, str] = Stream(templates).to_map(lambda x: x.type, lambda x: x.template) | ||
| 21 | # 时间范围中的每个时间 | ||
| 22 | time_range = spiderParam.get_time_range() | ||
| 23 | target_urls: Dict[TypeEnum, List[Tuple[str, str]]] = {} | ||
| 24 | for target in targets: | ||
| 25 | template = templates_group[target.value] | ||
| 26 | url_type = [(base_url + template.format(date=item), item) for item in time_range] | ||
| 27 | target_urls[target] = url_type | ||
| 28 | return target_urls | ||
| 29 | |||
| 30 | |||
| 31 | @retry( | ||
| 32 | stop=stop_after_attempt(3), | ||
| 33 | before_sleep=before_sleep_log(logger, logging.DEBUG), | ||
| 34 | wait=wait_exponential(multiplier=1, min=1, max=10), | ||
| 35 | after=after_log(logger, logging.DEBUG), | ||
| 36 | ) | ||
| 37 | async def fetch(session, url): | ||
| 38 | async with session.get(url) as response: | ||
| 39 | return await response.json() | ||
| 40 | |||
| 41 | |||
| 42 | async def fetch_and_parse(urls: List[Tuple[str, str]]) -> Dict[str, ResponseModel]: | ||
| 43 | async with aiohttp.ClientSession() as session: | ||
| 44 | requests = [fetch(session, url[0]) for url in urls] | ||
| 45 | results = await asyncio.gather(*requests) | ||
| 46 | return_data = {} | ||
| 47 | for k, v in zip(urls, results): | ||
| 48 | return_data[k[1]] = ResponseModel(**v) | ||
| 49 | return return_data | ||
| 50 | |||
| 51 | |||
| 52 | def parse_response_to_spider_result(guduo_responses: Dict[str, ResponseModel], dataType: TypeEnum) -> List[GuoDuoSpiderResult]: | ||
| 53 | results: List[GuoDuoSpiderResult] = [] | ||
| 54 | for k, v in guduo_responses.items(): | ||
| 55 | time_request = k | ||
| 56 | targetType = dataType | ||
| 57 | platform = 'ALL' | ||
| 58 | createTime = datetime.datetime.now() | ||
| 59 | for item in v.data: | ||
| 60 | sourceName = item.name | ||
| 61 | score = item.gdiFloat | ||
| 62 | results.append(GuoDuoSpiderResult(time=time_request, targetType=targetType, | ||
| 63 | platform=platform, sourceName=sourceName, | ||
| 64 | score=score, createTime=createTime)) | ||
| 65 | return results | ||
| 66 | |||
| 67 | |||
| 68 | async def batch_fetch_and_parse(urls: List[Tuple[str, str]], enum: TypeEnum): | ||
| 69 | result = await fetch_and_parse(urls) | ||
| 70 | return parse_response_to_spider_result(result, enum) | ||
| 71 | |||
| 72 | |||
| 73 | async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]: | ||
| 74 | """ | ||
| 75 | 获取评分数据 | ||
| 76 | :param spiderParam: 爬取参数 | ||
| 77 | :return: | ||
| 78 | """ | ||
| 79 | url_infos = build_request_url(spiderParam) | ||
| 80 | tasks = [batch_fetch_and_parse(urls, key) for key, urls in url_infos.items()] | ||
| 81 | results = await asyncio.gather(*tasks) | ||
| 82 | spider_datas = [item for sublist in results for item in sublist] | ||
| 83 | return spider_datas | ||
| 84 | |||
| 85 | |||
| 86 | if __name__ == '__main__': | ||
| 87 | # 获取程序开始执行的时间 | ||
| 88 | start_time = time.time() | ||
| 89 | param = SpiderParams( | ||
| 90 | startDate="2024-12-22", | ||
| 91 | endDate="2024-12-23", | ||
| 92 | target_type=[ | ||
| 93 | TypeEnum.ANIME, | ||
| 94 | TypeEnum.EPISODES, | ||
| 95 | TypeEnum.MOVIE, | ||
| 96 | TypeEnum.VARIETY, | ||
| 97 | ], | ||
| 98 | url="http://guduodata.com", | ||
| 99 | ) | ||
| 100 | res = asyncio.run(get_score_data(param)) | ||
| 101 | print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}") | ||
| 102 | print(res) |
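A note on the structure of this new module: `build_request_url` expands every configured template for every date in the requested range; `get_score_data` then fans the work out with two levels of `asyncio.gather` (one task per content type, and inside each task one request per date over a shared `aiohttp` session); and only the individual `fetch` call carries the tenacity retry policy, so a single failed request is retried without re-issuing the whole batch.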