Unverified Commit 476cf548 authored by Sergey M․'s avatar Sergey M․
Browse files

[sportbox] Improve extraction, add support for matchtv.ru and fix video id (closes #17978)

parent bebef109
......@@ -1043,7 +1043,7 @@ from .spike import (
)
from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import SportBoxEmbedIE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE
......
......@@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .tvc import TVCIE
from .sportbox import SportBoxEmbedIE
from .sportbox import SportBoxIE
from .smotri import SmotriIE
from .myvi import MyviIE
from .condenast import CondeNastIE
......@@ -2636,9 +2636,9 @@ class GenericIE(InfoExtractor):
return self.url_result(tvc_url, 'TVC')
# Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
sportbox_urls = SportBoxIE._extract_urls(webpage)
if sportbox_urls:
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
......
......@@ -8,20 +8,24 @@ from ..utils import (
determine_ext,
int_or_none,
js_to_json,
merge_dicts,
)
class SportBoxEmbedIE(InfoExtractor):
_VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
class SportBoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
'info_dict': {
'id': '211355',
'id': '109158',
'ext': 'mp4',
'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 292,
'view_count': int,
'timestamp': 1426237001,
'upload_date': '20150313',
},
'params': {
# m3u8 download
......@@ -33,12 +37,18 @@ class SportBoxEmbedIE(InfoExtractor):
}, {
'url': 'https://news.sportbox.ru/vdl/player/media/193095',
'only_matching': True,
}, {
'url': 'https://news.sportbox.ru/vdl/player/media/109158',
'only_matching': True,
}, {
'url': 'https://matchtv.ru/vdl/player/media/109158',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"',
webpage)
def _real_extract(self, url):
......@@ -46,22 +56,14 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
wjplayer_data = self._parse_json(
self._search_regex(
r'(?s)var\s+playerOptions\s*=\s*({.+?});', webpage, 'wjplayer settings'),
video_id, transform_source=js_to_json)
wjplayer_data['sources'] = self._parse_json(
sources = self._parse_json(
self._search_regex(
r'(?s)playerOptions\.sources\s*=\s*(\[.+?\]);', webpage, 'wjplayer sources'),
r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n',
webpage, 'sources'),
video_id, transform_source=js_to_json)
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'title', fatal=False) or video_id
formats = []
for source in wjplayer_data['sources']:
for source in sources:
src = source.get('src')
if not src:
continue
......@@ -75,14 +77,23 @@ class SportBoxEmbedIE(InfoExtractor):
})
self._sort_formats(formats)
player = self._parse_json(
self._search_regex(
r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage,
'player options', default='{}'),
video_id, transform_source=js_to_json)
media_id = player['mediaId']
info = self._search_json_ld(webpage, media_id, default={})
view_count = int_or_none(self._search_regex(
r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
return {
'id': video_id,
'title': title,
'thumbnail': wjplayer_data.get('poster'),
'duration': int_or_none(wjplayer_data.get('duration')),
return merge_dicts(info, {
'id': media_id,
'title': self._og_search_title(webpage, default=None) or media_id,
'thumbnail': player.get('poster'),
'duration': int_or_none(player.get('duration')),
'view_count': view_count,
'formats': formats,
}
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment