Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[vice] improve extraction(closes #23631)
  • Loading branch information
Remita Amine committed Jan 5, 2020
1 parent 484637a commit 44b434e
Showing 1 changed file with 106 additions and 106 deletions.
212 changes: 106 additions & 106 deletions youtube_dl/extractor/vice.py
@@ -1,35 +1,50 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import time
import functools
import hashlib
import json
import random
import re
import time

from .adobepass import AdobePassIE
from .youtube import YoutubeIE
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
OnDemandPagedList,
parse_age_limit,
str_or_none,
try_get,
)


class ViceIE(AdobePassIE):
class ViceBaseIE(InfoExtractor):
def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''):
return self._download_json(
'https://video.vice.com/api/v1/graphql', resource_id, query={
'query': '''{
%s(locale: "%s", %s: "%s"%s) {
%s
}
}''' % (resource, locale, resource_key, resource_id, args, fields),
})['data'][resource]


class ViceIE(ViceBaseIE, AdobePassIE):
IE_NAME = 'vice'
_VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)'
_VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})'
_TESTS = [{
'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
'info_dict': {
'id': '5e647f0125e145c9aef2069412c0cbde',
'id': '58c69e38a55424f1227dc3f7',
'ext': 'mp4',
'title': '10 Questions You Always Wanted To Ask: Pet Cremator',
'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5',
Expand All @@ -43,33 +58,31 @@ class ViceIE(AdobePassIE):
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
# geo restricted to US
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
'info_dict': {
'id': '930c0ad1f47141cc955087eecaddb0e2',
'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4',
'uploader': 'waypoint',
'uploader': 'vice',
'title': 'The Signal From Tölva',
'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1477941983,
'upload_date': '20161031',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
'info_dict': {
'id': '581b12b60a0e1f4c0fb6ea2f',
'ext': 'mp4',
'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
'uploader': 'VICE',
'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1485368119,
'upload_date': '20170125',
Expand All @@ -78,9 +91,7 @@ class ViceIE(AdobePassIE):
'params': {
# AES-encrypted m3u8
'skip_download': True,
'proxy': '127.0.0.1:8118',
},
'add_ie': ['UplynkPreplay'],
}, {
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
Expand All @@ -98,7 +109,7 @@ class ViceIE(AdobePassIE):
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)',
r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})',
webpage)

@staticmethod
Expand All @@ -109,31 +120,16 @@ def _extract_url(webpage):
def _real_extract(self, url):
locale, video_id = re.match(self._VALID_URL, url).groups()

webpage = self._download_webpage(
'https://video.vice.com/%s/embed/%s' % (locale, video_id),
video_id)

video = self._parse_json(
self._search_regex(
r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage,
'app state'), video_id)['video']
video_id = video.get('vms_id') or video.get('id') or video_id
title = video['title']
is_locked = video.get('locked')
video = self._call_api('videos', 'id', video_id, locale, '''body
locked
rating
thumbnail_url
title''')[0]
title = video['title'].strip()
rating = video.get('rating')
thumbnail = video.get('thumbnail_url')
duration = int_or_none(video.get('duration'))
series = try_get(
video, lambda x: x['episode']['season']['show']['title'],
compat_str)
episode_number = try_get(
video, lambda x: x['episode']['episode_number'])
season_number = try_get(
video, lambda x: x['episode']['season']['season_number'])
uploader = None

query = {}
if is_locked:
if video.get('locked'):
resource = self._get_mvpd_resource(
'VICELAND', title, video_id, rating)
query['tvetoken'] = self._extract_mvpd_auth(
Expand All @@ -148,12 +144,9 @@ def _real_extract(self, url):
query.update({
'exp': exp,
'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
'_ad_blocked': None,
'_ad_unit': '',
'_debug': '',
'skipadstitching': 1,
'platform': 'desktop',
'rn': random.randint(10000, 100000),
'fbprebidtoken': '',
})

try:
Expand All @@ -169,85 +162,94 @@ def _real_extract(self, url):
raise

video_data = preplay['video']
base = video_data['base']
uplynk_preplay_url = preplay['preplayURL']
episode = video_data.get('episode', {})
channel = video_data.get('channel', {})
formats = self._extract_m3u8_formats(
preplay['playURL'], video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
episode = video_data.get('episode') or {}
channel = video_data.get('channel') or {}
season = video_data.get('season') or {}

subtitles = {}
cc_url = preplay.get('ccURL')
if cc_url:
subtitles['en'] = [{
for subtitle in preplay.get('subtitleURLs', []):
cc_url = subtitle.get('url')
if not cc_url:
continue
language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en'
subtitles.setdefault(language_code, []).append({
'url': cc_url,
}]
})

return {
'_type': 'url_transparent',
'url': uplynk_preplay_url,
'formats': formats,
'id': video_id,
'title': title,
'description': base.get('body') or base.get('display_body'),
'thumbnail': thumbnail,
'duration': int_or_none(video_data.get('video_duration')) or duration,
'description': clean_html(video.get('body')),
'thumbnail': video.get('thumbnail_url'),
'duration': int_or_none(video_data.get('video_duration')),
'timestamp': int_or_none(video_data.get('created_at'), 1000),
'age_limit': parse_age_limit(video_data.get('video_rating')),
'series': video_data.get('show_title') or series,
'episode_number': int_or_none(episode.get('episode_number') or episode_number),
'age_limit': parse_age_limit(video_data.get('video_rating') or rating),
'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str),
'episode_number': int_or_none(episode.get('episode_number')),
'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
'season_number': int_or_none(season_number),
'season_id': str_or_none(episode.get('season_id')),
'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader,
'season_number': int_or_none(season.get('season_number')),
'season_id': str_or_none(season.get('id') or video_data.get('season_id')),
'uploader': channel.get('name'),
'uploader_id': str_or_none(channel.get('id')),
'subtitles': subtitles,
'ie_key': 'UplynkPreplay',
}


class ViceShowIE(InfoExtractor):
class ViceShowIE(ViceBaseIE):
IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'

_TEST = {
'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
_VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)'
_PAGE_SIZE = 25
_TESTS = [{
'url': 'https://video.vice.com/en_us/show/fck-thats-delicious',
'info_dict': {
'id': 'fuck-thats-delicious-2',
'title': "Fuck, That's Delicious",
'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
'id': '57a2040c8cb727dec794c901',
'title': 'F*ck, Thats Delicious',
'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.',
},
'playlist_count': 17,
}
'playlist_mincount': 64,
}, {
'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious',
'only_matching': True,
}]

def _real_extract(self, url):
show_id = self._match_id(url)
webpage = self._download_webpage(url, show_id)
def _fetch_page(self, locale, show_id, page):
videos = self._call_api('videos', 'show_id', show_id, locale, '''body
id
url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE))
for video in videos:
yield self.url_result(
video['url'], ViceIE.ie_key(), video.get('id'))

entries = [
self.url_result(video_url, ViceIE.ie_key())
for video_url, _ in re.findall(
r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
% ViceIE._VALID_URL, webpage)]
def _real_extract(self, url):
locale, display_id = re.match(self._VALID_URL, url).groups()
show = self._call_api('shows', 'slug', display_id, locale, '''dek
id
title''')[0]
show_id = show['id']

title = self._search_regex(
r'<title>(.+?)</title>', webpage, 'title', default=None)
if title:
title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
description = self._html_search_meta(
'description', webpage, 'description')
entries = OnDemandPagedList(
functools.partial(self._fetch_page, locale, show_id),
self._PAGE_SIZE)

return self.playlist_result(entries, show_id, title, description)
return self.playlist_result(
entries, show_id, show.get('title'), show.get('dek'))


class ViceArticleIE(InfoExtractor):
class ViceArticleIE(ViceBaseIE):
IE_NAME = 'vice:article'
_VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P<id>[^?#]+)'
_VALID_URL = r'https://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)'

_TESTS = [{
'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
'info_dict': {
'id': '41eae2a47b174a1398357cec55f1f6fc',
'id': '58dc0a3dee202d2a0ccfcbd8',
'ext': 'mp4',
'title': 'Mormon War on Porn ',
'description': 'md5:6394a8398506581d0346b9ab89093fef',
'title': 'Mormon War on Porn',
'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1491883129,
Expand All @@ -258,10 +260,10 @@ class ViceArticleIE(InfoExtractor):
# AES-encrypted m3u8
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
'add_ie': [ViceIE.ie_key()],
}, {
'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
'md5': '7fe8ebc4fa3323efafc127b82bd821d9',
'md5': '13010ee0bc694ea87ec40724397c2349',
'info_dict': {
'id': '3jstaBeXgAs',
'ext': 'mp4',
Expand All @@ -271,15 +273,15 @@ class ViceArticleIE(InfoExtractor):
'uploader_id': 'MotherboardTV',
'upload_date': '20140529',
},
'add_ie': ['Youtube'],
'add_ie': [YoutubeIE.ie_key()],
}, {
'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
'info_dict': {
'id': 'e2ed435eb67e43efb66e6ef9a6930a88',
'id': '57f41d3556a0a80f54726060',
'ext': 'mp4',
'title': "Making The World's First Male Sex Doll",
'description': 'md5:916078ef0e032d76343116208b6cc2c4',
'description': 'md5:19b00b215b99961cf869c40fbe9df755',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1476919911,
Expand All @@ -288,6 +290,7 @@ class ViceArticleIE(InfoExtractor):
},
'params': {
'skip_download': True,
'format': 'bestvideo',
},
'add_ie': [ViceIE.ie_key()],
}, {
Expand All @@ -299,14 +302,11 @@ class ViceArticleIE(InfoExtractor):
}]

def _real_extract(self, url):
display_id = self._match_id(url)

webpage = self._download_webpage(url, display_id)
locale, display_id = re.match(self._VALID_URL, url).groups()

prefetch_data = self._parse_json(self._search_regex(
r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n',
webpage, 'app state'), display_id)['pageData']
body = prefetch_data['body']
article = self._call_api('articles', 'slug', display_id, locale, '''body
embed_code''')[0]
body = article['body']

def _url_res(video_url, ie_key):
return {
Expand All @@ -316,7 +316,7 @@ def _url_res(video_url, ie_key):
'ie_key': ie_key,
}

vice_url = ViceIE._extract_url(webpage)
vice_url = ViceIE._extract_url(body)
if vice_url:
return _url_res(vice_url, ViceIE.ie_key())

Expand All @@ -332,6 +332,6 @@ def _url_res(video_url, ie_key):

video_url = self._html_search_regex(
r'data-video-url="([^"]+)"',
prefetch_data['embed_code'], 'video URL')
article['embed_code'], 'video URL')

return _url_res(video_url, ViceIE.ie_key())

0 comments on commit 44b434e

Please sign in to comment.