Does anyone have a local copy of zompist.com?
Posted: Tue Mar 22, 2022 7:41 pm
Just curious because I'm currently making one by copying the source (copy as in Ctrl+c Ctrl+v) into HTML files, as well as downloading the required images.
Crossing our fingers
WarpedWartWars wrote: ↑ Tue Mar 22, 2022 7:41 pm
I'm currently making one by copying the source (copy as in Ctrl+c Ctrl+v) into HTML files, as well as downloading the required images.

On Firefox, for single pages at least, it would probably be simpler to just do CTRL+S and select "Web page, complete". This should automatically download all the files used by a particular page.
mocha wrote: ↑ Fri Apr 01, 2022 11:14 am
It's a program that basically does this:
(1) Download webpage
(2) Look for links on webpage
(3) Goto (1) with new webpage(s)
Fairly simple to code and allows you to download a website fairly quickly (at least, all linked pages...)

But you have to be careful that you don't accidentally download the entire Internet in the process.
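For reference, a minimal sketch of that loop (not anyone's posted code; extract_links is a hypothetical helper, and the crawl is restricted to the starting host so it can't wander off across the rest of the Internet):
Code:
from urllib.parse import urlparse
from urllib.request import urlopen

def crawl(start_url, extract_links):
    # extract_links(html, base_url) should return absolute URLs found on the
    # page -- that is step (2), which is the fiddly part.
    host = urlparse(start_url).netloc
    todo, seen = [start_url], set()
    while todo:
        url = todo.pop(0)
        if url in seen or urlparse(url).netloc != host:
            continue
        seen.add(url)
        html = urlopen(url).read().decode(errors="replace")  # (1) download webpage
        todo.extend(extract_links(html, url))                # (2) collect its links
        # (3) the while loop goes back to (1) with the new URLs
    return seen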
I'd like to code it in Python, but the thing I'm having trouble with is step 2.
Code:
<a href="relative/path/with/no/explicit/domain.txt"></a>
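If it helps, the standard library can already turn a relative href like that into a full URL: urllib.parse.urljoin resolves it against the address of the page it was found on (a quick illustration, separate from the script below):
Code:
from urllib.parse import urljoin

base = "https://zompist.com/mars/index.html"  # page the link was found on
print(urljoin(base, "relative/path/with/no/explicit/domain.txt"))
# -> https://zompist.com/mars/relative/path/with/no/explicit/domain.txt
print(urljoin(base, "/index.html"))           # a leading "/" means the site root
# -> https://zompist.com/index.html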
Code:
import os
from urllib.request import urlopen
import re
def _load(url):
return urlopen(url).read() #.decode(errors="backslashreplace")
def load(url):
try:
return (file := _load(url)).decode()
except UnicodeDecodeError:
return file
def get_links(url, page):
narrowed = []
for (_, _, link) in re.findall(r"""(href|src)=(?P<quote>['"])(?P<url>.*?)(?P=quote)""",
page, re.IGNORECASE):
curr = ["https://" + domain(url)]
if link.startswith("http"):
if domain(link) == domain(url):
if "/" in (rest := nondomain(link)):
curr.append(rest.split("/"))
else:
curr.append(rest)
elif "/" in link:
curr.append(link.split("/"))
else:
curr.append(link)
narrowed.append("/".join(curr))
return narrowed
def save_file(path, file):
print("saving '" + path + "'...")
if "/" in path:
os.makedirs("/".join(path.split("/")[:-1]))
with open(path, "w" + isinstance(file, bytes) * "b") as f:
f.write(file)
def dhelp(url):
return (url.lstrip("qwertyuiopasdfghjklzxcvbnm").lstrip(":/")
if "://" in url else url).split("/")
def domain(url):
return dhelp(url)[0]
def nondomain(url):
return ("/".join(dhelp(url)[1:]) if len(dhelp(url)) else "")
def _main(url):
global done
if url in done:
return
done.append(url)
page = load(url)
save_file(nondomain(url), page)
for link in get_links(url, page):
_main(link)
def main(url):
global done
done = []
os.mkdir(domain(url))
os.chdir(domain(url))
_main((url + "index.html") if not nondomain(url) else url)
Code:
>>> main("https://zompist.com/")
saving 'index.html'...
Traceback (most recent call last):
File "<pyshell#32>", line 1, in <module>
main("https://zompist.com/")
File "C:\Users\<user>\Desktop\py\webscrape.py", line 65, in main
_main((url + "index.html") if not nondomain(url) else url)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 56, in _main
save_file(nondomain(url), page)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 37, in save_file
f.write(file)
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0263' in position 4166: character maps to <undefined>
mocha wrote: ↑ Thu Apr 07, 2022 6:47 pm
Because I'm lazy my solution to this was to simply ignore the errors:
Code:
file.write(src, encode='utf-8', errors='ignore')
I might be losing random characters here and there, but if I am, I don't see them, which is good enough for me!

Now I'm getting
Code:
Traceback (most recent call last):
File "<pyshell#0>", line 1, in <module>
main("https://www.zompist.com/")
File "C:\Users\<user>\Desktop\py\webscrape.py", line 64, in main
_main((url + "index.html") if not nondomain(url) else url)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 55, in _main
save_file(nondomain(url), page)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 37, in save_file
f.write(file, errors="ignore")
TypeError: TextIOWrapper.write() takes no keyword arguments
I'm using 3.10.2. What might it be that's causing it?

alice wrote: ↑ Fri Apr 08, 2022 1:22 pm
This error message:
Code:
TypeError: TextIOWrapper.write() takes no keyword arguments
suggests that mocha's version of TextIOWrapper is not the same as yours. That's pretty much all I can say.

Maybe I could put the "errors" kwarg in "open(...)"...
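For what it's worth, that should do it: encoding= and errors= are parameters of the built-in open(), not of write(). A small sketch (not the posted save_file itself), forcing UTF-8 so Windows' default cp1252 codec never gets involved:
Code:
def save_text(path, text):
    # encoding/error handling belong on open(); write() only takes the string
    with open(path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(text)
# (Opening with "wb" and writing the raw bytes sidesteps encoding altogether.)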
Code:
import os
from urllib.request import urlopen
import re
def _load(url):
return urlopen(url).read() #.decode(errors="backslashreplace")
def load(url):
try:
return (file := _load(url)).decode()
except UnicodeDecodeError:
return file
def get_links(url, page):
narrowed = []
for (_, _, link) in re.findall(r"""(href|src)=(?P<quote>['"])(?P<url>.*?)(?P=quote)""",
page, re.IGNORECASE):
if not nondomain(link):
continue
curr = ["https://" + domain(url)]
if link.startswith("http"):
if domain(link) == domain(url):
if "/" in (rest := nondomain(link)):
curr += rest.split("/")
else:
curr += [rest]
elif "/" in link:
curr += link.split("/")
else:
curr += [link]
narrowed.append("/".join(curr))
return narrowed
def save_file(path, file):
print("saving '" + path + "'...")
if "/" in path:
os.makedirs("/".join(path.split("/")[:-1]))
with (open(path, "wb")
if isinstance(file, bytes)
else open(path, "w", errors="ignore")) as f:
f.write(file)
def dhelp(url):
return (url.lstrip("qwertyuiopasdfghjklzxcvbnm").lstrip(":/")
if "://" in url else url).split("/")
def domain(url):
return dhelp(url)[0]
def nondomain(url):
return ("/".join(dhelp(url)[1:]) if len(dhelp(url)) else "")
def _main(url):
global done
if url in done:
return
done.append(url)
print(url)
page = load(url)
save_file(nondomain(url), page)
if not isinstance(page, bytes):
for link in get_links(url, page):
if nondomain(link):
_main(link)
def main(url):
global done
done = []
os.mkdir(domain(url))
os.chdir(domain(url))
_main((url + "index.html") if not nondomain(url) else url)
Code:
>>> main("https://zompist.com/")
https://zompist.com/index.html
saving 'index.html'...
https://zompist.com/illo/zbblogo.gif
saving 'illo/zbblogo.gif'...
https://zompist.com/mars/index.html
saving 'mars/index.html'...
https://zompist.com/../incatena.html
Traceback (most recent call last):
File "<pyshell#18>", line 1, in <module>
main("https://zompist.com/")
File "C:\Users\<user>\Desktop\py\webscrape.py", line 71, in main
_main((url + "index.html") if not nondomain(url) else url)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 64, in _main
_main(link)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 64, in _main
_main(link)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 59, in _main
page = load(url)
File "C:\Users\<user>\Desktop\py\webscrape.py", line 10, in load
return (file := _load(url)).decode()
File "C:\Users\<user>\Desktop\py\webscrape.py", line 6, in _load
return urlopen(url).read() #.decode(errors="backslashreplace")
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 216, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 525, in open
response = meth(req, response)
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 634, in http_response
response = self.parent.error(
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 563, in error
return self._call_chain(*args)
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 496, in _call_chain
result = func(*args)
File "C:\Users\<user>\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 643, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
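The 400 here is most likely the literal "../" left in the generated URL (https://zompist.com/../incatena.html). urllib.parse.urljoin collapses those dot segments when resolving a link against the page it came from, e.g.:
Code:
from urllib.parse import urljoin

# e.g. a "../incatena.html" link found on https://zompist.com/mars/index.html
print(urljoin("https://zompist.com/mars/index.html", "../incatena.html"))
# -> https://zompist.com/incatena.html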
Code:
import os
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from http.client import InvalidURL
import sys
import re
def istextlike(url):
return any(url.lower().endswith(ext) for ext in
(".txt",".md",".html",".htm",".shtml"))
def _load(url):
return urlopen(url).read() #.decode(errors="backslashreplace")
def load(url):
c = _load(url)
if istextlike(url):
c = re.sub(r"\\x([0-9a-f]{2})",lambda m:chr(int(m[1],16)),
c.decode(errors='backslashreplace'))
return c
def get_links(url, page):
narrowed = []
for match in re.findall(r"""(?:
(?:href|src)=
(?P<quote1>['"])
(?P<url1>.*?)
(?P=quote1)
)|
(?:
(?:url|open)\(
(?P<quote2>['"])
(?P<url2>.*?)
(?P=quote2)
\)
)|
(?:
name\s*=\s*
(?P<quote3>['"])
(?P<url3>.*?)
(?P=quote3)
)""",
page, re.IGNORECASE | re.VERBOSE):
link = match[1] or match[3] or match[5]
if ("#" in link or
"?" in link):
continue
relurl = nondomain(url).split("/")[:-1]
curr = ["http://"+domain(url)]
while link.startswith("."):
relurl = relurl[:-1]
link = "/".join(link.split("/")[1:])
curr += relurl
if link.startswith("http"):
if domain(link) == domain(url):
curr = [curr[0]] + nondomain(link).split("/")
else:
continue
elif ("://" in link or
link.startswith("mailto:")):
continue
elif "/" in link:
if link.startswith("/"):
curr = [curr[0]]
link = link[1:]
curr += link.split("/")
else:
curr += [link]
narrowed.append("/".join(curr))
return narrowed
def save_file(path, file):
path = path.strip()
if path[-1] == "/":
path += "index.html"
print("saving '" + path + "'...")
if "/" in path:
os.makedirs("/".join(path.split("/")[:-1]), exist_ok=True)
try:
with (open(path, "wb")) as f:
#if isinstance(file, bytes)
#else open(path, "w", errors="ignore")) as f:
f.write(file if isinstance(file, bytes)
else file.encode())
except PermissionError:
pass
def dhelp(url):
return (url.lstrip("qwertyuiopasdfghjklzxcvbnm").lstrip(":/")
if "://" in url else #"./"+
url).split("/")
def domain(url):
return dhelp(url)[0]
def nondomain(url):
return ("/".join(dhelp(url)[1:]) if len(dhelp(url)) else "")
def _main(todo, done):
url = todo.pop(0)
if url in done:
return
done.append(url)
print(url)
try:
page = load(url)
except HTTPError as err:
if err.status == 404:
print("404: " + url, file=sys.stderr)
return
raise
except URLError as err:
if True:#err.errno == -3:
print("Failed, no internet probably")
return
except InvalidURL as err:
print(err, file=sys.stderr)
return
save_file(nondomain(url), page)
if isinstance(page, bytes):
return
for link in get_links(url, page):
if nondomain(link):
todo.append(link)#_main(link)
def main(url):
done = []
todo = [url]
os.makedirs(domain(url).replace(":", "_"), exist_ok=True)
os.chdir(domain(url).replace(":", "_"))
try:
while len(todo)>0:
_main(todo, done) #(url + "/index.html") if not nondomain(url) else url)
finally:
os.chdir("../")
Code:
import os
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from http.client import InvalidURL
import sys
import re
def istextlike(url):
return any(url.lower().endswith(ext) for ext in
(".txt",".md",".html",".htm",".shtml"))
def _load(url):
try:
return urlopen(url).read() #.decode(errors="backslashreplace")
except UnicodeEncodeError:
raise URLError("invalid char in url")
def load(url):
c = _load(url)
if istextlike(url):
c = re.sub(r"\\x([0-9a-f]{2})",lambda m:chr(int(m[1],16)),
c.decode(errors='backslashreplace'))
return c
def loadlocal(url, file):
c = file.read()
if istextlike(url):
c = re.sub(r"\\x([0-9a-f]{2})",lambda m:chr(int(m[1],16)),
c.decode(errors='backslashreplace'))
return c
def get_links(url, page):
narrowed = []
for match in re.findall(r"""(?:
(?:href|src)=
(?P<quote1>['"])
(?P<url1>.*?)
(?P=quote1)
)|
(?:
(?:url|open)\(
(?P<quote2>['"])
(?P<url2>.*?)
(?P=quote2)
\)
)|
(?:
name\s*=\s*
(?P<quote3>['"])
(?P<url3>.*?)
(?P=quote3)
)""",
page, re.IGNORECASE | re.VERBOSE):
link = match[1] or match[3] or match[5]
if ("#" in link or
"?" in link):
continue
relurl = nondomain(url).split("/")[:-1]
curr = ["http://"+domain(url)]
while link.startswith("."):
relurl = relurl[:-1]
link = "/".join(link.split("/")[1:])
curr += relurl
if link.startswith("http"):
if domain(link) == domain(url):
curr = [curr[0]] + nondomain(link).split("/")
else:
continue
elif "." not in link:
continue
elif ("://" in link or
link.startswith("mailto:")):
continue
elif "/" in link:
if link.startswith("/"):
curr = [curr[0]]
link = link[1:]
curr += link.split("/")
else:
curr += [link]
narrowed.append("/".join(curr))
return narrowed
def save_file(path, file):
path = sanitizefilename(path)
print("saving '" + path + "'...")
if "/" in path:
os.makedirs("/".join(path.split("/")[:-1]), exist_ok=True)
try:
with (open(path, "wb")) as f:
#if isinstance(file, bytes)
#else open(path, "w", errors="ignore")) as f:
f.write(file if isinstance(file, bytes)
else file.encode())
except PermissionError:
pass
def sanitizefilename(path):
path = path.strip()
if path[-1] == "/":
path += "index.html"
return path.replace(":", "_")
def dhelp(url):
return (url.lstrip("qwertyuiopasdfghjklzxcvbnm").lstrip(":/")
if "://" in url else #"./"+
url).split("/")
def domain(url):
return dhelp(url)[0]
def nondomain(url):
return ("/".join(dhelp(url)[1:]) if len(dhelp(url)) else "")
def _main(todo, done):
url = todo.pop(0)
if url in done:
return
done.append(url)
print(url)
if os.path.exists(sanitizefilename(nondomain(url))):
p = open(sanitizefilename(nondomain(url)), "rb")
page = loadlocal(sanitizefilename(nondomain(url)), p)
else:
try:
page = load(url)
except HTTPError as err:
if err.status//100==4:
print(str(err.status) + ": " + url, file=sys.stderr)
return
raise
except URLError as err:
if True:#err.errno == -3:
print("Failed with URLError: " + str(err.reason), file=sys.stderr)
return
except InvalidURL as err:
print(err, file=sys.stderr)
return
if ((isinstance(page, str) and "</address>" not in page)
or isinstance(page, bytes)):
save_file(nondomain(url), page)
if isinstance(page, bytes):
return
for link in get_links(url, page):
if nondomain(link):
todo.append(link)#_main(link)
def main(url):
done = []
todo = [url]
os.makedirs(sanitizefilename(domain(url)), exist_ok=True)
os.chdir(sanitizefilename(domain(url)))
try:
while len(todo)>0:
_main(todo, done) #(url + "/index.html") if not nondomain(url) else url)
finally:
os.chdir("../")