Unverified Commit 019f4c03 authored by Yen Chi Hsuan's avatar Yen Chi Hsuan
Browse files

[bandcamp] Fix extraction for incomplete albums

Closes #11727
parent 2ab2c0d1
version <unreleased>
Extractors
* [bandcamp] Fix extraction for incomplete albums (#11727)
* [iwara] Fix extraction (#11781)
* [googledrive] Fix extraction on Python 3.6
......
......@@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
'id': 'entropy-ep',
},
'playlist_mincount': 3,
}, {
# not all tracks have songs
'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
'info_dict': {
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
},
'playlist_count': 2,
}]
def _real_extract(self, url):
......@@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
album_id = mobj.group('album_id')
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
track_elements = re.findall(
r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
if not track_elements:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths]
for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
title = self._html_search_regex(
r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
webpage, 'title', fatal=False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment