-booruScraper/util/WebRequest/Handlers.py

148 lines
4.2 KiB
Python

#!/usr/bin/python3
import sys
import codecs
import http.client
import email.parser
import urllib.request
import urllib.parse
import urllib.error
import os.path
import time
import http.cookiejar
import traceback
import logging
import zlib
import bs4
import re
import string
import gzip
import io
import socket
import json
import base64
import random
class HeadRequest(urllib.request.Request):
def get_method(self):
# Apparently HEAD is now being blocked. Because douche.
return "GET"
# return "HEAD"
class HTTPRedirectBlockerErrorHandler(urllib.request.HTTPErrorProcessor): # pragma: no cover
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
# only add this line to stop 302 redirection.
if code == 302:
print("Code!", 302)
return response
if code == 301:
print("Code!", 301)
return response
print("[HTTPRedirectBlockerErrorHandler] http_response! code:", code)
print(hdrs)
print(msg)
if not (200 <= code < 300):
response = self.parent.error('http', request, response, code, msg, hdrs)
return response
https_response = http_response
# Custom redirect handler to work around
# issue https://bugs.python.org/issue17214
class HTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
# Implementation note: To avoid the server sending us into an
# infinite loop, the request object needs to track what URLs we
# have already seen. Do this by adding a handler-specific
# attribute to the Request object.
def http_error_302(self, req, fp, code, msg, headers):
# Some servers (incorrectly) return multiple Location headers
# (so probably same goes for URI). Use first header.
if "location" in headers:
newurl = headers["location"]
elif "uri" in headers:
newurl = headers["uri"]
else:
return
# fix a possible malformed URL
urlparts = urllib.parse.urlparse(newurl)
# For security reasons we don't allow redirection to anything other
# than http, https or ftp.
if urlparts.scheme not in ('http', 'https', 'ftp', ''):
raise urllib.error.HTTPError(
newurl, code,
"%s - Redirection to url '%s' is not allowed" % (msg, newurl),
headers, fp)
if not urlparts.path:
urlparts = list(urlparts)
urlparts[2] = "/"
newurl = urllib.parse.urlunparse(urlparts)
# http.client.parse_headers() decodes as ISO-8859-1. Recover the
# original bytes and percent-encode non-ASCII bytes, and any special
# characters such as the space.
newurl = urllib.parse.quote(
newurl, encoding="iso-8859-1", safe=string.punctuation)
newurl = urllib.parse.urljoin(req.full_url, newurl)
# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
# handlers that also use handler-specific request attributes
new = self.redirect_request(req, fp, code, msg, headers, newurl)
if new is None: # pragma: no cover
return
# loop detection
# .redirect_dict has a key url if url was previously visited.
if hasattr(req, 'redirect_dict'):
visited = new.redirect_dict = req.redirect_dict
if (visited.get(newurl, 0) >= self.max_repeats or
len(visited) >= self.max_redirections):
raise urllib.error.HTTPError(req.full_url, code,
self.inf_msg + msg, headers, fp)
else:
visited = new.redirect_dict = req.redirect_dict = {}
visited[newurl] = visited.get(newurl, 0) + 1
# Don't close the fp until we are sure that we won't use it
# with HTTPError.
fp.read()
fp.close()
return self.parent.open(new, timeout=req.timeout)
class PreemptiveBasicAuthHandler(urllib.request.HTTPBasicAuthHandler):
'''Preemptive basic auth.
Instead of waiting for a 403 to then retry with the credentials,
send the credentials if the url is handled by the password manager.
Note: please use realm=None when calling add_password.'''
def http_request(self, req):
url = req.get_full_url()
realm = None
# this is very similar to the code from retry_http_basic_auth()
# but returns a request object.
user, pw = self.passwd.find_user_password(realm, url)
if pw:
raw = "%s:%s" % (user, pw)
raw = raw.encode("ascii")
auth = b'Basic ' + base64.standard_b64encode(raw).strip()
req.add_unredirected_header(self.auth_header, auth)
return req
https_request = http_request