Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Backups
youtube-dl
Commits
351f37c0
Unverified
Commit
351f37c0
authored
Aug 13, 2019
by
Sergey M․
Browse files
[youtube:playlist] Improve flat extraction (closes #21927)
parent
3bce4ff7
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
43 additions
and
6 deletions
+43
-6
youtube_dl/extractor/youtube.py
youtube_dl/extractor/youtube.py
+43
-6
No files found.
youtube_dl/extractor/youtube.py
View file @
351f37c0
...
...
@@ -31,6 +31,7 @@ from ..utils import (
clean_html
,
dict_get
,
error_to_compat_str
,
extract_attributes
,
ExtractorError
,
float_or_none
,
get_element_by_attribute
,
...
...
@@ -324,17 +325,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
for
video_id
,
video_title
in
self
.
extract_videos_from_page
(
content
):
yield
self
.
url_result
(
video_id
,
'Youtube'
,
video_id
,
video_title
)
def
extract_videos_from_page
(
self
,
page
):
ids_in_page
=
[]
titles_in_page
=
[]
for
mobj
in
re
.
finditer
(
self
.
_VIDEO_RE
,
page
):
def
extract_videos_from_page_impl
(
self
,
video_re
,
page
,
ids_in_page
,
titles_in_page
):
for
mobj
in
re
.
finditer
(
video_re
,
page
):
# The link with index 0 is not the first video of the playlist (not sure if this is still the case)
if
'index'
in
mobj
.
groupdict
()
and
mobj
.
group
(
'id'
)
==
'0'
:
continue
video_id
=
mobj
.
group
(
'id'
)
video_title
=
unescapeHTML
(
mobj
.
group
(
'title'
))
video_title
=
unescapeHTML
(
mobj
.
group
(
'title'
))
if
'title'
in
mobj
.
groupdict
()
else
None
if
video_title
:
video_title
=
video_title
.
strip
()
if
video_title
==
'► Play all'
:
video_title
=
None
try
:
idx
=
ids_in_page
.
index
(
video_id
)
if
video_title
and
not
titles_in_page
[
idx
]:
...
...
@@ -342,6 +344,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
except
ValueError
:
ids_in_page
.
append
(
video_id
)
titles_in_page
.
append
(
video_title
)
def extract_videos_from_page(self, page):
    """Return an iterable of (video_id, video_title) pairs found in page.

    Two empty lists are handed to extract_videos_from_page_impl together
    with self._VIDEO_RE; that helper fills both lists in place, and the
    lists are then paired up with zip().
    """
    video_ids, video_titles = [], []
    self.extract_videos_from_page_impl(
        self._VIDEO_RE, page, video_ids, video_titles)
    return zip(video_ids, video_titles)
...
...
@@ -2438,7 +2446,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(%(playlist_id)s)
)"""
%
{
'playlist_id'
:
YoutubeBaseInfoExtractor
.
_PLAYLIST_ID_RE
}
_TEMPLATE_URL
=
'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE
=
r
'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE_TPL
=
r
'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE
=
_VIDEO_RE_TPL
%
r
'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME
=
'youtube:playlist'
_TESTS
=
[{
'url'
:
'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re'
,
...
...
@@ -2603,6 +2612,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def _real_initialize(self):
    """Initialization hook: delegates to self._login() and returns None."""
    self._login()
def extract_videos_from_page(self, page):
    """Return an iterable of (video_id, video_title) pairs found in page.

    Extraction runs in several passes over the same page:

    1. Every tag carrying a ``data-video-id`` attribute with an
       11-character id is parsed with extract_attributes; the id comes
       from ``data-video-id`` and the title from ``data-title`` (the
       title may be None when that attribute is absent).
    2. extract_videos_from_page_impl is run with self._VIDEO_RE as a
       fallback.
    3. The same helper is run with two relaxed patterns: a bare
       ``/watch?v=`` link and a ``data-video-ids`` attribute.

    All passes share the same two lists, which the helper extends in
    place.
    """
    video_ids = []
    video_titles = []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        tag_attrs = extract_attributes(tag)
        current_id = tag_attrs['data-video-id']
        # NOTE(review): if extract_attributes already decodes HTML entities,
        # this second unescapeHTML pass decodes twice - confirm.
        current_title = unescapeHTML(tag_attrs.get('data-title'))
        # Only strip real, non-empty titles; None and '' pass through as-is.
        video_ids.append(current_id)
        video_titles.append(
            current_title.strip() if current_title else current_title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, video_ids, video_titles)

    return zip(video_ids, video_titles)
def
_extract_mix
(
self
,
playlist_id
):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment