Skip to content

Commit

Permalink
Improve URL extraction
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sergey M․ committed Jul 21, 2018
1 parent 4ecf300 commit 3052a30
Show file tree
Hide file tree
Showing 47 changed files with 166 additions and 139 deletions.
3 changes: 2 additions & 1 deletion youtube_dl/extractor/adultswim.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..utils import (
int_or_none,
strip_or_none,
url_or_none,
)


Expand Down Expand Up @@ -98,7 +99,7 @@ def _real_extract(self, url):
if not video_id:
entries = []
for episode in video_data.get('archiveEpisodes', []):
episode_url = episode.get('url')
episode_url = url_or_none(episode.get('url'))
if not episode_url:
continue
entries.append(self.url_result(
Expand Down
3 changes: 2 additions & 1 deletion youtube_dl/extractor/afreecatv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
determine_ext,
ExtractorError,
int_or_none,
url_or_none,
urlencode_postdata,
xpath_text,
)
Expand Down Expand Up @@ -304,7 +305,7 @@ def _real_extract(self, url):
file_elements = video_element.findall(compat_xpath('./file'))
one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1):
file_url = file_element.text
file_url = url_or_none(file_element.text)
if not file_url:
continue
key = file_element.get('key', '')
Expand Down
15 changes: 8 additions & 7 deletions youtube_dl/extractor/amp.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@

from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
mimetype2ext,
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
parse_iso8601,
url_or_none,
)


Expand Down Expand Up @@ -35,7 +36,7 @@ def get_media_node(name, default=None):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
thumbnail = thumbnail_data.get('@attributes', {})
thumbnail_url = thumbnail.get('url')
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
Expand All @@ -51,7 +52,7 @@ def get_media_node(name, default=None):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
subtitle = subtitle_data.get('@attributes', {})
subtitle_href = subtitle.get('href')
subtitle_href = url_or_none(subtitle.get('href'))
if not subtitle_href:
continue
subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
Expand All @@ -65,7 +66,7 @@ def get_media_node(name, default=None):
media_content = [media_content]
for media_data in media_content:
media = media_data.get('@attributes', {})
media_url = media.get('url')
media_url = url_or_none(media.get('url'))
if not media_url:
continue
ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
Expand All @@ -79,7 +80,7 @@ def get_media_node(name, default=None):
else:
formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'],
'url': media_url,
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
'ext': ext,
Expand Down
3 changes: 2 additions & 1 deletion youtube_dl/extractor/animeondemand.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
determine_ext,
extract_attributes,
ExtractorError,
url_or_none,
urlencode_postdata,
urljoin,
)
Expand Down Expand Up @@ -165,7 +166,7 @@ def extract_info(html, video_id, num=None):
}, fatal=False)
if not playlist:
continue
stream_url = playlist.get('streamurl')
stream_url = url_or_none(playlist.get('streamurl'))
if stream_url:
rtmp = re.search(
r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
Expand Down
3 changes: 2 additions & 1 deletion youtube_dl/extractor/aol.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)


Expand Down Expand Up @@ -77,7 +78,7 @@ def _real_extract(self, url):
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
for rendition in video_data.get('renditions', []):
video_url = rendition.get('url')
video_url = url_or_none(rendition.get('url'))
if not video_url:
continue
ext = rendition.get('format')
Expand Down
6 changes: 3 additions & 3 deletions youtube_dl/extractor/apa.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
js_to_json,
url_or_none,
)


Expand Down Expand Up @@ -68,8 +68,8 @@ def _real_extract(self, url):
for source in sources:
if not isinstance(source, dict):
continue
source_url = source.get('file')
if not source_url or not isinstance(source_url, compat_str):
source_url = url_or_none(source.get('file'))
if not source_url:
continue
ext = determine_ext(source_url)
if ext == 'm3u8':
Expand Down
3 changes: 2 additions & 1 deletion youtube_dl/extractor/aparat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..utils import (
int_or_none,
mimetype2ext,
url_or_none,
)


Expand Down Expand Up @@ -43,7 +44,7 @@ def _real_extract(self, url):

formats = []
for item in file_list[0]:
file_url = item.get('file')
file_url = url_or_none(item.get('file'))
if not file_url:
continue
ext = mimetype2ext(item.get('type'))
Expand Down
4 changes: 2 additions & 2 deletions youtube_dl/extractor/ard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from .common import InfoExtractor
from .generic import GenericIE
from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
Expand All @@ -15,6 +14,7 @@
unified_strdate,
xpath_text,
update_url_query,
url_or_none,
)
from ..compat import compat_etree_fromstring

Expand Down Expand Up @@ -100,7 +100,7 @@ def _extract_formats(self, media_info, video_id):
quality = stream.get('_quality')
server = stream.get('_server')
for stream_url in stream_urls:
if not isinstance(stream_url, compat_str) or '//' not in stream_url:
if not url_or_none(stream_url):
continue
ext = determine_ext(stream_url)
if quality != 'auto' and ext in ('f4m', 'm3u8'):
Expand Down
7 changes: 4 additions & 3 deletions youtube_dl/extractor/bandcamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
unescapeHTML,
update_url_query,
unified_strdate,
url_or_none,
)


Expand Down Expand Up @@ -131,8 +132,8 @@ def _real_extract(self, url):
fatal=False)
if not stat:
continue
retry_url = stat.get('retry_url')
if not isinstance(retry_url, compat_str):
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
Expand Down Expand Up @@ -306,7 +307,7 @@ def _real_extract(self, url):

formats = []
for format_id, format_url in show['audio_stream'].items():
if not isinstance(format_url, compat_str):
if not url_or_none(format_url):
continue
for known_ext in KNOWN_EXTENSIONS:
if known_ext in format_id:
Expand Down
10 changes: 6 additions & 4 deletions youtube_dl/extractor/breakcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import compat_str
from ..utils import int_or_none
from ..utils import (
int_or_none,
url_or_none,
)


class BreakIE(InfoExtractor):
Expand Down Expand Up @@ -55,8 +57,8 @@ def _real_extract(self, url):

formats = []
for video in content:
video_url = video.get('url')
if not video_url or not isinstance(video_url, compat_str):
video_url = url_or_none(video.get('url'))
if not video_url:
continue
bitrate = int_or_none(self._search_regex(
r'(\d+)_kbps', video_url, 'tbr', default=None))
Expand Down
6 changes: 3 additions & 3 deletions youtube_dl/extractor/cammodels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)


Expand Down Expand Up @@ -56,8 +56,8 @@ def _real_extract(self, url):
for media in encodings:
if not isinstance(media, dict):
continue
media_url = media.get('location')
if not media_url or not isinstance(media_url, compat_str):
media_url = url_or_none(media.get('location'))
if not media_url:
continue

format_id_list = [format_id]
Expand Down
6 changes: 3 additions & 3 deletions youtube_dl/extractor/ccma.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
int_or_none,
parse_duration,
parse_iso8601,
parse_resolution,
url_or_none,
)


Expand Down Expand Up @@ -53,8 +53,8 @@ def _real_extract(self, url):
media_url = media['media']['url']
if isinstance(media_url, list):
for format_ in media_url:
format_url = format_.get('file')
if not format_url or not isinstance(format_url, compat_str):
format_url = url_or_none(format_.get('file'))
if not format_url:
continue
label = format_.get('label')
f = parse_resolution(label)
Expand Down
14 changes: 6 additions & 8 deletions youtube_dl/extractor/crackle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,14 @@
import re

from .common import InfoExtractor
from ..compat import (
compat_str,
compat_HTTPError,
)
from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
parse_age_limit,
parse_duration,
url_or_none,
ExtractorError
)

Expand Down Expand Up @@ -86,8 +84,8 @@ def _real_extract(self, url):
for e in media['MediaURLs']:
if e.get('UseDRM') is True:
continue
format_url = e.get('Path')
if not format_url or not isinstance(format_url, compat_str):
format_url = url_or_none(e.get('Path'))
if not format_url:
continue
ext = determine_ext(format_url)
if ext == 'm3u8':
Expand Down Expand Up @@ -124,8 +122,8 @@ def _real_extract(self, url):
for cc_file in cc_files:
if not isinstance(cc_file, dict):
continue
cc_url = cc_file.get('Path')
if not cc_url or not isinstance(cc_url, compat_str):
cc_url = url_or_none(cc_file.get('Path'))
if not cc_url:
continue
lang = cc_file.get('Locale') or 'en'
subtitles.setdefault(lang, []).append({'url': cc_url})
Expand Down
7 changes: 4 additions & 3 deletions youtube_dl/extractor/dctp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
float_or_none,
int_or_none,
unified_timestamp,
url_or_none,
)


Expand Down Expand Up @@ -69,7 +70,7 @@ def _real_extract(self, url):
endpoint = next(
server['endpoint']
for server in servers
if isinstance(server.get('endpoint'), compat_str) and
if url_or_none(server.get('endpoint')) and
'cloudfront' in server['endpoint'])
else:
endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/'
Expand All @@ -92,8 +93,8 @@ def _real_extract(self, url):
for image in images:
if not isinstance(image, dict):
continue
image_url = image.get('url')
if not image_url or not isinstance(image_url, compat_str):
image_url = url_or_none(image.get('url'))
if not image_url:
continue
thumbnails.append({
'url': image_url,
Expand Down
7 changes: 3 additions & 4 deletions youtube_dl/extractor/discoverygo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
Expand All @@ -12,6 +11,7 @@
parse_age_limit,
remove_end,
unescapeHTML,
url_or_none,
)


Expand Down Expand Up @@ -69,9 +69,8 @@ def _extract_video_info(self, video, stream, display_id):
captions = stream.get('captions')
if isinstance(captions, list):
for caption in captions:
subtitle_url = caption.get('fileUrl')
if (not subtitle_url or not isinstance(subtitle_url, compat_str) or
not subtitle_url.startswith('http')):
subtitle_url = url_or_none(caption.get('fileUrl'))
if not subtitle_url or not subtitle_url.startswith('http'):
continue
lang = caption.get('fileLang', 'en')
ext = determine_ext(subtitle_url)
Expand Down
Loading

0 comments on commit 3052a30

Please sign in to comment.