'OSError: [Errno 22} Invalid argument: 'downloaded/misc/jquery.js?v=1.4.4'
tfp = open(filename, 'wb')
OSError: [Errno 22} Invalid argument: 'downloaded/misc/jquery.js?v=1.4.4'
Can anyone help me with this error? I figure it has something to do with jquery.js?v=1.4.4
not being valid. I am new at python; I apologize if I am missing something obvious.
Here is the code:
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
if source.startswith("http://www."):
url = "http://"+source[11:]
elif source.startswith("http://"):
url = source
elif source.startswith("www."):
url = source[4:]
url = "http://"+source
else:
url = baseUrl+"/"+source
if baseUrl not in url:
return None
return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
path = absoluteUrl.replace("www.", "")
path = path.replace(baseUrl, "")
path = downloadDirectory+path
directory = os.path.dirname(path)
if not os.path.exists(directory):
os.makedirs(directory)
return path
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html.parser")
downloadList = bsObj.findAll(src=True)
for download in downloadList:
fileUrl = getAbsoluteURL(baseUrl, download["src"])
if fileUrl is not None:
print(fileUrl)
urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
Solution 1:[1]
For the function urlretrieve(url, filename, reporthook, data)
,
the argument you give for the filename
parameter needs to be a valid file name on your operating system.
In this case, when you run
urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
the argument you give for url
is "http://pythonscraping.com/misc/jquery.js?v=1.4.4", and the argument you give for filename
is "downloaded/misc/jquery.js?v=1.4.4".
"jquery.js?v=1.4.4" I believe is not a valid file name.
Solution: in the getDownloadPath
function, change return path
to
return path.partition('?')[0]
Solution 2:[2]
downloaded/misc/jquery.js?v=1.4.4 is not a valid file name. I think a better solution like this:
import requests
from bs4 import BeautifulSoup
download_directory = "downloaded"
base_url = "http://www.pythonscraping.com/"
# Use Requests instead urllib
def get_files_url(base_url):
# Return a list of tag elements that contain src attrs
html = requests.get(base_url)
soup = BeautifulSoup(html.text, "lxml")
return soup.find_all(src=True)
def get_file_name(url):
# Return the last part after the last "/" as file name
# Eg: return a.png as file name if url=http://pythonscraping.com/a.png
# Remove characters not valid in file name
file_name = url.split("/")[-1]
remove_list = "?><\/:\"*|"
for ch in remove_list:
if ch in file_name:
file_name = file_name.replace(ch, "")
return download_directory + "/" + file_name
def get_formatted_url(url):
if not url.startswith("http://"):
return base_url + url
elif base_url not in url:
return None
else:
return url
links = get_files_url(base_url)
for link in links:
url = link["src"]
url = get_formatted_url(url)
if url is None:
continue
print(url)
result = requests.get(url, stream=True)
file_name = get_file_name(url)
print(file_name)
with open(file_name, 'wb') as f:
for chunk in result.iter_content(10):
f.write(chunk)
Solution 3:[3]
You can modify the getDownloadPath function like this:
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
remove_list = '?<>\/:\"*|'
for ch in remove_list:
if ch in absoluteUrl:
absoluteUrl = absoluteUrl.replace(ch, '')
path = absoluteUrl.replace('www.', '')
path = path.replace(baseUrl, '')
path = downloadDirectory + '/' + path
directory = os.path.dirname(path)
if not os.path.exists(directory):
os.makedirs(directory)
return path
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Jacob Liu |
Solution 2 | |
Solution 3 |