Commit 601ef0003b1a3ff1326114c11fcd829b7fc894f5 by wenxin

update

1 parent a3f8eab9
......@@ -27,7 +27,7 @@ getSpider:
urlTemplate:
# 1 TV series  2 movie  3 variety show  4 anime
- type: 1
-  template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+  template: "/m/v3/billboard/list?type=DAILY&category=DRAMA&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
- type: 2
template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
- type: 3
......
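Each urlTemplate entry maps a category type to a parameterized billboard path. A minimal sketch of how the {date} placeholder might be filled (BASE_URL, TEMPLATES, and build_billboard_url are hypothetical names, not part of this repo):

# Hypothetical helper: expand a urlTemplate entry with str.format
BASE_URL = "http://guduodata.com"
TEMPLATES = {
    1: "/m/v3/billboard/list?type=DAILY&category=DRAMA&date={date}&attach=gdi&orderTitle=gdi&platformId=0",
    2: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0",
}

def build_billboard_url(type_id: int, date: str) -> str:
    # str.format fills the {date} placeholder declared in the YAML template
    return BASE_URL + TEMPLATES[type_id].format(date=date)

print(build_billboard_url(1, "2024-12-20"))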
......@@ -30,6 +30,22 @@ async def lifespan(app: FastAPI):
logging.info("apscheduler启动完成")
await init()
logging.info("初始化数据库完成")
# Record the time the startup crawl begins
start_time = time.time()
param = SpiderParams(
startDate="2024-12-20",
endDate="2024-12-21",
target_type=[
TypeEnum.ANIME,
TypeEnum.EPISODES,
TypeEnum.MOVIE,
TypeEnum.VARIETY,
],
url="http://guduodata.com",
)
print(param.get_time_range())
# lifespan already runs inside the event loop, so await the coroutine directly;
# calling asyncio.run() here would raise "RuntimeError: asyncio.run() cannot be
# called from a running event loop"
await scrawl_and_save(param)
# time.time() returns seconds, so no /1000 conversion is needed
print(f"Elapsed time: {time.time() - start_time:.2f}s")
yield
# When the async context manager exits its context, clean up and release resources
scheduler.shutdown()
......
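Awaiting scrawl_and_save inside lifespan blocks server startup until the crawl finishes. If that is not intended, a common alternative (a sketch under that assumption, not what this commit does) is to run the crawl as a background task; scrawl_and_save and SpiderParams come from this repo, the rest of the wiring is illustrative:

import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.schemas.spider_schema import SpiderParams, TypeEnum
from app.service.spider_job_service import scrawl_and_save

@asynccontextmanager
async def lifespan(app: FastAPI):
    param = SpiderParams(
        startDate="2024-12-20",
        endDate="2024-12-21",
        target_type=[TypeEnum.ANIME],
        url="http://guduodata.com",
    )
    # create_task runs the crawl concurrently with request handling
    crawl = asyncio.create_task(scrawl_and_save(param))
    yield
    if not crawl.done():
        crawl.cancel()  # stop the crawl if it is still running at shutdown

app = FastAPI(lifespan=lifespan)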
This diff could not be displayed because it is too large.
# router/users.py
import logging
from typing import Dict, Any
from fastapi import APIRouter, Path
from app.schemas.spider_schema import SpiderParams, TaskInfo
from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
from app.spider import guduo_spider
......@@ -35,7 +33,7 @@ async def add_test_job(taskId: int = Path(..., description="ID of the task
taskInfo = TaskInfo(taskId=taskId, taskStart=True,
executeStrategy='0 0/1 * * * ?',
-url='www.baidu.com')
+url='www.baidu.com', taskMd5='test')
add_job(['test'], taskInfo, test)
......
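The executeStrategy string '0 0/1 * * * ?' is a Quartz-style cron expression (seconds field first, '?' for day-of-week), which APScheduler's CronTrigger does not parse directly. A sketch of the equivalent trigger, assuming add_job translates the expression along these lines:

from apscheduler.triggers.cron import CronTrigger

# Fire at second 0 of every minute, matching the Quartz expression '0 0/1 * * * ?'
every_minute = CronTrigger(second=0, minute="*/1")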
......@@ -29,8 +29,6 @@ class SafeDict(Generic[K, V]):
return self.data.get(key)
if __name__ == '__main__':
test: SafeDict[int, str] = SafeDict()
test.put(1, '1')
......
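The diff only shows SafeDict's get method and a put call. A minimal sketch of a lock-guarded generic dict consistent with that usage (the real app.schemas.safe_contrainer.SafeDict may differ):

import threading
from typing import Dict, Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class SafeDict(Generic[K, V]):
    def __init__(self) -> None:
        self.data: Dict[K, V] = {}
        self._lock = threading.Lock()

    def put(self, key: K, value: V) -> None:
        # The lock serializes concurrent writers
        with self._lock:
            self.data[key] = value

    def get(self, key: K) -> Optional[V]:
        with self._lock:
            return self.data.get(key)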
import asyncio
import logging
import time
from datetime import datetime
from multiprocessing import Manager
-from typing import List, Any, Dict
-from app.schemas.safe_contrainer import SafeDict
+from typing import List, Any
from app.job.job import scheduler
from app.model.mysql_model import SpiderModel
from app.schemas.safe_contrainer import SafeDict
from app.schemas.spider_schema import (
ApschedulerJob,
TaskInfo,
SpiderParams,
-GuoDuoSpiderResult,
+GuoDuoSpiderResult, TypeEnum,
)
from app.spider.http_spider import get_score_data
......@@ -73,10 +73,12 @@ def get_job_info(taskId: int = None) -> List[ApschedulerJob]:
async def scrawl_and_save(taskParam: SpiderParams):
try:
# Run the spider to fetch results; the try/except catches any exception it raises
logger.info("Start fetching site data")
results = await get_score_data(taskParam)
except Exception as e:
logger.error(f"爬虫任务执行失败,失败原因:{e}")
return
logger.info(f"开始保存数据")
asyncTasks = (save_or_update(item) for item in results)
await asyncio.gather(*asyncTasks)
logger.info(f"爬虫任务执行完成,爬取到数据{len(results)}条 保存到数据库完成")
......@@ -108,3 +110,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
updateTime=datetime.now(),
)
await obj.save()
......
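save_or_update builds a model instance and calls obj.save() with a fresh updateTime. If SpiderModel is a Tortoise ORM model (an assumption; the diff never names the ORM), the save-or-update branching can be collapsed into a single upsert call. The lookup field names below are hypothetical:

from datetime import datetime

from app.model.mysql_model import SpiderModel
from app.schemas.spider_schema import GuoDuoSpiderResult

async def save_or_update(result: GuoDuoSpiderResult) -> None:
    # update_or_create looks up a row by the keyword filters and either
    # updates the defaults on the existing row or inserts a new one
    await SpiderModel.update_or_create(
        defaults={"score": result.score, "updateTime": datetime.now()},
        name=result.name,          # hypothetical lookup fields; the real
        billboardDate=result.date, # model's columns are not shown in the diff
    )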
......@@ -18,12 +18,18 @@ from tenacity import (
logger = logging.getLogger(__name__)
@retry(
stop=stop_after_attempt(3),
before_sleep=before_sleep_log(logger, logging.DEBUG),
wait=wait_exponential(multiplier=1, min=1, max=10),
after=after_log(logger, logging.DEBUG),
)
async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
time_range = spiderParam.get_time_range()
url = spiderParam.url
scrawl_types = spiderParam.target_type
async with async_playwright() as p:
-browser = await p.firefox.launch(headless=False)
+browser = await p.firefox.launch(headless=True)
# Create one browser page per date in the time range
pages = await tqdm_asyncio.gather(
*(browser.new_page() for _ in range(len(time_range)))
......@@ -52,7 +58,7 @@ async def hand_one_data(
# Strip only the leading zero; replace("0", "") would also corrupt days like "10" or "20"
day = day.lstrip("0")
# The pika date picker uses zero-based months
month = int(month) - 1
await page.goto(url)
-# Click
+# Wait for the overlay to disappear
await page.click('//*[@id="currentDateTxt"]')
data_css_selector = (
f'#day span.pika-button.pika-day[type="button"]'
......@@ -64,10 +70,6 @@ async def hand_one_data(
# Check whether the target element exists; if not, return an empty []
if not await doc.is_visible():
return []
-# Check whether the date is clickable
-if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
-    logger.warning(f"Date {targetDate} is disabled and cannot be clicked")
-    return []
# Click the target date
await doc.click()
# Final step: fix up the locator
......@@ -120,22 +122,12 @@ async def get_data(
return result
@retry(
stop=stop_after_attempt(3),
before_sleep=before_sleep_log(logger, logging.DEBUG),
wait=wait_exponential(multiplier=1, min=1, max=10),
after=after_log(logger, logging.DEBUG),
)
async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
return await startBrowser(spiderParam)
if __name__ == "__main__":
# Record the program start time
start_time = time.time()
param = SpiderParams(
startDate="2024-12-22",
endDate="2024-12-23",
startDate="2024-01-02",
endDate="2024-01-15",
target_type=[
TypeEnum.ANIME,
TypeEnum.EPISODES,
......@@ -144,6 +136,6 @@ if __name__ == "__main__":
],
url="http://guduodata.com/",
)
-res = asyncio.run(retry_job(param))
-print(f"Retry statistics: {retry_job.statistics}")
-print(f"Elapsed time: {(time.time() - start_time) / 1000}, length: {len(res)}")
+res = asyncio.run(startBrowser(param))
+print(f"Retry statistics: {startBrowser.statistics}")
+# time.time() returns seconds; dividing by 1000 mislabels the elapsed time
+print(f"Elapsed time: {time.time() - start_time:.2f}s, length: {len(res)}")
......
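The commit moves the @retry decorator from the removed retry_job wrapper onto startBrowser itself, so startBrowser.statistics now carries the retry counters directly. With stop_after_attempt(3) and wait_exponential(multiplier=1, min=1, max=10), a failing call is attempted three times with waits of roughly 1s and 2s in between. A self-contained sketch of the same policy on a toy coroutine:

import asyncio
import logging

from tenacity import (
    RetryError,
    after_log,
    before_sleep_log,
    retry,
    stop_after_attempt,
    wait_exponential,
)

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

@retry(
    stop=stop_after_attempt(3),
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    after=after_log(logger, logging.DEBUG),
)
async def flaky() -> str:
    raise RuntimeError("boom")  # always fails, so all three attempts are used

try:
    asyncio.run(flaky())
except RetryError:
    # tenacity raises RetryError once the stop condition is reached
    print(flaky.statistics)  # e.g. {'attempt_number': 3, ...}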
......@@ -88,23 +88,3 @@ async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
spider_datas = [item for sublist in results for item in sublist]
logger.info(f"爬取数据结束,共爬取{len(spider_datas)}条数据")
return spider_datas
if __name__ == '__main__':
# Record the program start time
start_time = time.time()
param = SpiderParams(
startDate="2024-12-22",
endDate="2024-12-23",
target_type=[
TypeEnum.ANIME,
TypeEnum.EPISODES,
TypeEnum.MOVIE,
TypeEnum.VARIETY,
],
url="http://guduodata.com",
)
print(param.get_time_range())
res = asyncio.run(get_score_data(param))
print(f"程序执行耗时时间:{(time.time() - start_time) / 1000}长度为:{len(res)}")
print(res)
......
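Both removed __main__ blocks print param.get_time_range() before crawling, and startBrowser opens one page per entry it returns. Its body is never shown in this diff; a plausible sketch (an assumption, not the repo's code) that returns one ISO date string per day between startDate and endDate:

from datetime import date, timedelta
from typing import List

def get_time_range(start: str, end: str) -> List[str]:
    # One ISO date string per day, endpoints inclusive
    d0, d1 = date.fromisoformat(start), date.fromisoformat(end)
    return [(d0 + timedelta(days=i)).isoformat() for i in range((d1 - d0).days + 1)]

print(get_time_range("2024-12-20", "2024-12-21"))  # ['2024-12-20', '2024-12-21']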