Source code for abpytools.utils.downloads
from urllib import request, error
class Download:

    def __init__(self, url='', verbose=False, timeout=5):
        self.url = url
        self.verbose = verbose
        self.html = ''
        self.error = False
        self.timeout = timeout
    def download(self, user_agent='wswp', num_retries=2):
        try:
            self.html = download(self.url, self.verbose, user_agent=user_agent, num_retries=num_retries,
                                 timeout=self.timeout)
        except IOError:
            raise ValueError("Could not download requested page.")
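
# A minimal usage sketch of the Download class (an illustration only; the URL
# below is hypothetical and a real call performs network I/O):
#
#     downloader = Download(url='https://example.com', verbose=True, timeout=5)
#     downloader.download()            # raises ValueError if the page cannot be fetched
#     print(downloader.html[:100])     # first 100 characters of the page contents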
def download(url, verbose, user_agent='wswp', num_retries=2, decoding_format='utf-8', timeout=5):
    """
    Function to download the contents of a given url

    Input:
            url: str
                string with the url to download from
            verbose: bool
                print out the url and any errors
            user_agent: str
                user agent sent with the request (default 'wswp')
            num_retries: int
                number of times to retry downloading
                if there is a server error
            decoding_format: str
                encoding used to decode the response (default 'utf-8')
            timeout: int
                timeout for the request, in seconds (default 5)

    Output:
            returns: str
                string with contents of given url
    """
    if verbose:
        print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request_obj = request.Request(url, headers=headers)
    try:
        with request.urlopen(request_obj, timeout=timeout) as response:
            html = response.read()
    except error.URLError as e:
        if verbose:
            print('Download error:', e.reason)
        # retry transient 5XX server errors, as documented for num_retries
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, verbose, user_agent=user_agent, num_retries=num_retries - 1,
                            decoding_format=decoding_format, timeout=timeout)
        raise IOError(e.reason)
    return html.decode(decoding_format)
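
# Example of calling the module-level function directly (a sketch; the URL is
# hypothetical and the call performs real network I/O):
#
#     try:
#         page = download('https://example.com', verbose=True, num_retries=2)
#     except IOError as exc:
#         print('Failed to download:', exc)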