文鑫 / guduo_spider
Commit 601ef0003b1a3ff1326114c11fcd829b7fc894f5
authored 2024-12-25 18:17:39 +0800 by wenxin

update

1 parent a3f8eab9
Showing 8 changed files with 40 additions and 51 deletions
app/application.yaml
app/event/fastapi_event.py
app/logs/app.log
app/router/spider_router.py
app/schemas/safe_contrainer.py
app/service/spider_job_service.py
app/spider/guduo_spider.py
app/spider/http_spider.py
app/application.yaml
@@ -27,7 +27,7 @@ getSpider:
   urlTemplate:
     # 1 TV drama  2 movie  3 variety show  4 anime
     - type: 1
-      template: "/m/v3/billboard/list?type=DAILY&category=ALL_ANIME&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
+      template: "/m/v3/billboard/list?type=DAILY&category=DRAMA&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 2
       template: "/m/v3/billboard/list?type=DAILY&category=NETWORK_MOVIE&date={date}&attach=gdi&orderTitle=gdi&platformId=0"
     - type: 3
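Each urlTemplate entry pairs a numeric type with a path template carrying a {date} placeholder. A minimal sketch of how a consumer might select and fill one of these templates, assuming the getSpider/urlTemplate structure shown above and PyYAML for loading (the loader itself is hypothetical, not code from this repo):

import yaml  # PyYAML

# load the config and index the templates by their numeric type
with open("app/application.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)
templates = {entry["type"]: entry["template"] for entry in config["getSpider"]["urlTemplate"]}

# fill the {date} placeholder for type 1 (TV drama)
path = templates[1].format(date="2024-12-20")
print("http://guduodata.com" + path)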
app/event/fastapi_event.py
@@ -30,6 +30,22 @@ async def lifespan(app: FastAPI):
     logging.info("apscheduler startup complete")
     await init()
     logging.info("database initialization complete")
     # record the start time of the run
     start_time = time.time()
     param = SpiderParams(
         startDate="2024-12-20",
         endDate="2024-12-21",
         target_type=[
             TypeEnum.ANIME,
             TypeEnum.EPISODES,
             TypeEnum.MOVIE,
             TypeEnum.VARIETY,
         ],
         url="http://guduodata.com",
     )
     print(param.get_time_range())
     # awaited directly: asyncio.run() would raise inside the already-running event loop
     await scrawl_and_save(param)
     print(f"elapsed: {time.time() - start_time:.2f}s")
     yield
     # on exiting the async context manager, clean up and release resources
     scheduler.shutdown()
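For context, FastAPI's lifespan hook is an async context manager: everything before the yield runs at startup, everything after it at shutdown. A minimal self-contained sketch of that shape (the startup/shutdown bodies here are placeholders, not this project's code):

from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # startup: e.g. scheduler.start(), database init, a one-off crawl
    print("starting up")
    yield  # the application serves requests while suspended here
    # shutdown: e.g. scheduler.shutdown()
    print("shutting down")

app = FastAPI(lifespan=lifespan)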
app/logs/app.log
This diff could not be displayed because it is too large.
app/router/spider_router.py
# router/users.py
import logging
from typing import Dict, Any
from fastapi import APIRouter, Path
from app.schemas.spider_schema import SpiderParams, TaskInfo
from app.service.spider_job_service import add_job, get_job_info, add_guduo_job
from app.spider import guduo_spider

@@ -35,7 +33,7 @@ async def add_test_job(taskId: int = Path(..., description="要查询的任务
     taskInfo = TaskInfo(taskId=taskId,
                         taskStart=True,
                         executeStrategy='0 0/1 * * * ?',
-                        url='www.baidu.com')
+                        url='www.baidu.com', taskMd5='test')
     add_job(['test'], taskInfo, test)
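The executeStrategy value '0 0/1 * * * ?' is a Quartz-style cron expression (fire at second 0 of every minute). APScheduler, which this project uses for scheduling, does not parse Quartz strings with the '?' field directly, so an equivalent trigger has to be spelled out field by field. A sketch under that assumption:

import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

def tick():
    print("job fired")

async def main():
    scheduler = AsyncIOScheduler()
    # equivalent of Quartz '0 0/1 * * * ?': fire at second 0 of every minute
    scheduler.add_job(tick, CronTrigger(second=0, minute="*/1"))
    scheduler.start()          # needs a running event loop
    await asyncio.sleep(130)   # keep the loop alive long enough to see two firings
    scheduler.shutdown()

asyncio.run(main())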
app/schemas/safe_contrainer.py
@@ -29,8 +29,6 @@ class SafeDict(Generic[K, V]):
         return self.data.get(key)

 if __name__ == '__main__':
     test: SafeDict[int, str] = SafeDict()
     test.put(1, '1')
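The diff only shows SafeDict's get method and the __main__ usage; the rest of the class is collapsed. A plausible minimal implementation of a generic, lock-guarded dictionary with this interface (an assumption, not the repo's actual code):

import threading
from typing import Dict, Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class SafeDict(Generic[K, V]):
    """A dict wrapper whose operations are serialized by a lock."""

    def __init__(self) -> None:
        self.data: Dict[K, V] = {}
        self._lock = threading.Lock()

    def put(self, key: K, value: V) -> None:
        with self._lock:
            self.data[key] = value

    def get(self, key: K) -> Optional[V]:
        with self._lock:
            return self.data.get(key)

test: SafeDict[int, str] = SafeDict()
test.put(1, '1')
print(test.get(1))  # -> '1'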
app/service/spider_job_service.py
import asyncio
import logging
import time
from datetime import datetime
from multiprocessing import Manager
from typing import List, Any, Dict

from app.job.job import scheduler
from app.model.mysql_model import SpiderModel
from app.schemas.safe_contrainer import SafeDict
from app.schemas.spider_schema import (
    ApschedulerJob,
    TaskInfo,
    SpiderParams,
    GuoDuoSpiderResult,
    TypeEnum,
)
from app.spider.http_spider import get_score_data
@@ -73,10 +73,12 @@ def get_job_info(taskId: int = None) -> List[ApschedulerJob]:

 async def scrawl_and_save(taskParam: SpiderParams):
     try:
         # run the spider; the call is wrapped in try/except to catch failures
         logger.info("starting to fetch site data")
         results = await get_score_data(taskParam)
     except Exception as e:
         logger.error(f"spider task failed: {e}")
         return
     logger.info("starting to save data")
     asyncTasks = (save_or_update(item) for item in results)
     await asyncio.gather(*asyncTasks)
     logger.info(f"spider task finished: scraped and saved {len(results)} records")
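One detail of the gather call above: by default asyncio.gather propagates the first exception, so a single failed save_or_update aborts the whole await even though sibling tasks keep running. A hedged variant (with a stand-in save function) that isolates per-item failures via return_exceptions=True:

import asyncio

async def save_or_update(item: int) -> int:
    # stand-in for the real database upsert
    if item == 2:
        raise ValueError("bad record")
    return item

async def main():
    results = await asyncio.gather(
        *(save_or_update(i) for i in range(4)),
        return_exceptions=True,  # collect failures instead of raising the first one
    )
    failures = [r for r in results if isinstance(r, Exception)]
    print(f"saved {len(results) - len(failures)}, failed {len(failures)}")

asyncio.run(main())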
@@ -108,3 +110,6 @@ async def save_or_update(result: GuoDuoSpiderResult):
         updateTime=datetime.now(),
     )
     await obj.save()
app/spider/guduo_spider.py
@@ -18,12 +18,18 @@ from tenacity import (

 logger = logging.getLogger(__name__)

+@retry(
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    after=after_log(logger, logging.DEBUG),
+)
 async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     time_range = spiderParam.get_time_range()
     url = spiderParam.url
     scrawl_types = spiderParam.target_type
     async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=False)
+        browser = await p.firefox.launch(headless=True)
         # create one browser page per date in the range
         pages = await tqdm_asyncio.gather(*(browser.new_page() for _ in range(len(time_range))))
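The tenacity decorator added above retries startBrowser up to three times with exponential backoff (1s, 2s, 4s, capped at 10s) and logs before each sleep and after each attempt. A self-contained sketch of the same decorator on a deliberately flaky coroutine (the flaky function is hypothetical):

import asyncio
import logging
from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log, after_log

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
calls = {"n": 0}

@retry(
    stop=stop_after_attempt(3),                          # give up after 3 attempts
    wait=wait_exponential(multiplier=1, min=1, max=10),  # 1s, 2s, 4s ... capped at 10s
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    after=after_log(logger, logging.DEBUG),
)
async def flaky() -> str:
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

print(asyncio.run(flaky()))
print(flaky.statistics)  # the same attribute printed in __main__ below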
@@ -52,7 +58,7 @@ async def hand_one_data(
     day = day.lstrip("0")  # drop the leading zero; replace("0", "") would also strip interior zeros ("20" -> "2")
     month = int(month) - 1
     await page.goto(url)
     # click the date field and wait for the overlay mask to disappear
     await page.click('//*[@id="currentDateTxt"]')
     data_css_selector = (
         f'#day span.pika-button.pika-day[type="button"]'
@@ -64,10 +70,6 @@ async def hand_one_data(
     # if the target element is not visible, return an empty list
     if not await doc.is_visible():
         return []
     # check whether the date is clickable
     if await doc.get_attribute('class') == 'pika-button pika-day is-disabled':
         logger.warning(f"date {targetDate} is disabled and cannot be clicked")
         return []
     # click the target date
     await doc.click()
     # final step: correct the locator
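hand_one_data drives a Pikaday date picker through Playwright: locate the day cell, bail out if it is absent or carries the is-disabled class, otherwise click it. A minimal standalone sketch of that guard pattern (URL and selector are placeholders):

import asyncio
from playwright.async_api import async_playwright

async def click_day(url: str, selector: str) -> bool:
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        doc = page.locator(selector).first
        if not await doc.is_visible():  # element absent: nothing to click
            await browser.close()
            return False
        cls = await doc.get_attribute("class") or ""
        if "is-disabled" in cls:        # disabled dates cannot be clicked
            await browser.close()
            return False
        await doc.click()
        await browser.close()
        return True

# asyncio.run(click_day("http://guduodata.com", '#day span.pika-button.pika-day'))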
@@ -120,22 +122,12 @@ async def get_data(
     return result

-@retry(
-    stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    after=after_log(logger, logging.DEBUG),
-)
-async def retry_job(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
-    return await startBrowser(spiderParam)

 if __name__ == "__main__":
     # record the start time of the run
     start_time = time.time()
     param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
+        startDate="2024-01-02",
+        endDate="2024-01-15",
         target_type=[
             TypeEnum.ANIME,
             TypeEnum.EPISODES,
@@ -144,6 +136,6 @@ if __name__ == "__main__":
         ],
         url="http://guduodata.com/",
     )
-    res = asyncio.run(retry_job(param))
-    print(f"retry statistics: {retry_job.statistics}")
+    res = asyncio.run(startBrowser(param))
+    print(f"retry statistics: {startBrowser.statistics}")
     print(f"elapsed: {time.time() - start_time:.2f}s, result count: {len(res)}")
app/spider/http_spider.py
@@ -88,23 +88,3 @@ async def get_score_data(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
     spider_datas = [item for sublist in results for item in sublist]
     logger.info(f"scraping finished: collected {len(spider_datas)} records")
     return spider_datas
-
-if __name__ == '__main__':
-    # record the start time of the run
-    start_time = time.time()
-    param = SpiderParams(
-        startDate="2024-12-22",
-        endDate="2024-12-23",
-        target_type=[
-            TypeEnum.ANIME,
-            TypeEnum.EPISODES,
-            TypeEnum.MOVIE,
-            TypeEnum.VARIETY,
-        ],
-        url="http://guduodata.com",
-    )
-    print(param.get_time_range())
-    res = asyncio.run(get_score_data(param))
-    print(f"elapsed: {time.time() - start_time:.2f}s, result count: {len(res)}")
-    print(res)
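get_score_data gathers one result list per date and flattens the list of lists with a nested comprehension. For comparison, the same flatten via itertools.chain.from_iterable:

from itertools import chain

results = [[1, 2], [], [3]]  # per-date result lists, as asyncio.gather returns them
flat_comprehension = [item for sublist in results for item in sublist]
flat_chain = list(chain.from_iterable(results))
assert flat_comprehension == flat_chain == [1, 2, 3]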