Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
文鑫
/
guduo_spider
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
c022d21e
...
c022d21ee0405b8f298fd98d7fcaa035384aa2b8
authored
2024-12-22 00:30:57 +0800
by
文鑫
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
添加tqdm
1 parent
828ad47e
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
31 deletions
app/spider/guduo_spider.py
requirements.txt
app/spider/guduo_spider.py
View file @
c022d21
...
...
@@ -6,6 +6,7 @@ from playwright.async_api import Page, async_playwright
from
superstream
import
Stream
from
app.schemas.spider_schema
import
GuoDuoSpiderResult
,
SpiderParams
,
TypeEnum
import
logging
from
tqdm.asyncio
import
tqdm_asyncio
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -17,10 +18,10 @@ async def startBrowser(spiderParam: SpiderParams) -> List[GuoDuoSpiderResult]:
async
with
async_playwright
()
as
p
:
browser
=
await
p
.
firefox
.
launch
(
headless
=
True
)
# 创建指定个数的浏览器页面
pages
=
await
asyncio
.
gather
(
*
(
browser
.
new_page
()
pages
=
await
tqdm_
asyncio
.
gather
(
*
(
browser
.
new_page
()
for
_
in
range
(
len
(
time_range
))))
# 同时循环time_range与pages 去调用hand_one_data异步方法
results
=
await
asyncio
.
gather
(
*
(
hand_one_data
(
time_range
[
i
],
pages
[
i
],
results
=
await
tqdm_
asyncio
.
gather
(
*
(
hand_one_data
(
time_range
[
i
],
pages
[
i
],
scrawl_types
,
url
)
for
i
in
range
(
len
(
time_range
))))
return
(
Stream
(
results
)
...
...
@@ -94,11 +95,11 @@ async def get_data(targetPage: Page, scrawl_type: int, target_time: str) -> Opti
if
__name__
==
'__main__'
:
# 获取程序开始执行的时间
start_time
=
time
.
time
()
param
=
SpiderParams
(
startDate
=
'
1991
-01-02'
,
endDate
=
'
1991-01-0
5'
,
param
=
SpiderParams
(
startDate
=
'
2024
-01-02'
,
endDate
=
'
2024-01-1
5'
,
target_type
=
[
TypeEnum
.
ANIME
,
TypeEnum
.
EPISODES
,
TypeEnum
.
MOVIE
]
,
url
=
'http://guduodata.com/'
)
res
=
asyncio
.
run
(
startBrowser
(
param
))
print
(
f
'程序执行耗时时间:{
time.time() - start_time}
长度为:{len(res)}'
)
print
(
f
'程序执行耗时时间:{
(time.time() - start_time)/1000}
长度为:{len(res)}'
)
for
it
in
res
:
print
(
it
)
...
...
requirements.txt
deleted
100644 → 0
View file @
828ad47
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.7.0
APScheduler==3.11.0
asyncmy==0.2.10
click==8.1.7
exceptiongroup==1.2.2
fastapi==0.115.6
greenlet==3.1.1
h11==0.14.0
idna==3.10
iso8601==2.1.0
playwright==1.49.1
pydantic==2.10.4
pydantic_core==2.27.2
pyee==12.0.0
pypika-tortoise==0.3.2
pytz==2024.2
PyYAML==6.0.2
sniffio==1.3.1
starlette==0.41.3
SuperStream==0.2.6
tortoise-orm==0.22.2
typing_extensions==4.12.2
tzlocal==5.2
uvicorn==0.34.0
Please
register
or
sign in
to post a comment