#!/usr/bin/python3

import urllib.request
import urllib.parse
import urllib.error

import os.path

import time
import http.cookiejar

import traceback

import logging
import zlib
import codecs
import re
import sys
import gzip
import io
import socket
import json

from threading import Lock

import bs4
try:
	import socks
	from sockshandler import SocksiPyHandler
	HAVE_SOCKS = True
except ImportError:
	HAVE_SOCKS = False

from . import HeaderParseMonkeyPatch

from . import ChromiumMixin
from . import PhantomJSMixin
from . import Handlers
from . import iri2uri
from . import Constants
from . import Exceptions

#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703

COOKIEWRITELOCK = Lock()

GLOBAL_COOKIE_FILE = None

def as_soup(str):
	return bs4.BeautifulSoup(str, "lxml")

def determine_json_encoding(json_bytes):
	'''
	Given the fact that the first 2 characters in json are guaranteed to be ASCII, we can use
	these to determine the encoding.
	See: http://tools.ietf.org/html/rfc4627#section-3

	Copied here:
	Since the first two characters of a JSON text will always be ASCII
	characters [RFC0020], it is possible to determine whether an octet
	stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
	at the pattern of nulls in the first four octets.

	        00 00 00 xx  UTF-32BE
	        00 xx 00 xx  UTF-16BE
	        xx 00 00 00  UTF-32LE
	        xx 00 xx 00  UTF-16LE
	        xx xx xx xx  UTF-8
	'''

	assert(isinstance(json_bytes, bytes))

	if len(json_bytes) > 4:
		b1, b2, b3, b4 = json_bytes[0], json_bytes[1], json_bytes[2], json_bytes[3]
		if b1 == 0 and b2 == 0 and b3 == 0 and b4 != 0:
			return "UTF-32BE"
		elif b1 == 0 and b2 != 0 and b3 == 0 and b4 != 0:
			return "UTF-16BE"
		elif b1 != 0 and b2 == 0 and b3 == 0 and b4 == 0:
			return "UTF-32LE"
		elif b1 != 0 and b2 == 0 and b3 != 0 and b4 == 0:
			return "UTF-16LE"
		elif b1 != 0 and b2 != 0 and b3 != 0 and b4 != 0:
			return "UTF-8"
		else:
			raise Exceptions.ContentTypeError("Unknown encoding!")

	elif len(json_bytes) > 2:
		b1, b2 = json_bytes[0], json_bytes[1]
		if b1 == 0 and b2 == 0:
			return "UTF-32BE"
		elif b1 == 0 and b2 != 0:
			return "UTF-16BE"
		elif b1 != 0 and b2 == 0:
			raise Exceptions.ContentTypeError("Json string too short to definitively infer encoding.")
		elif b1 != 0 and b2 != 0:
			return "UTF-8"
		else:
			raise Exceptions.ContentTypeError("Unknown encoding!")

	raise Exceptions.ContentTypeError("Input string too short to guess encoding!")

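# Illustrative behaviour of determine_json_encoding() (a doctest-style sketch; the sample
# payloads below are made up, not taken from any real service):
#
#   >>> determine_json_encoding('{"a": 1}'.encode("utf-8"))
#   'UTF-8'
#   >>> determine_json_encoding('{"a": 1}'.encode("utf-16-be"))
#   'UTF-16BE'
#   >>> determine_json_encoding('{"a": 1}'.encode("utf-32-le"))
#   'UTF-32LE'
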
# A urllib2 wrapper that provides error handling and logging, as well as cookie management. It's a bit crude, but it works.
# Also supports transport compression.
# OOOOLLLLLLDDDDD, has lots of creaky internals. Needs some cleanup desperately, but lots of crap depends on almost everything.
# Arrrgh.

class WebGetRobust(PhantomJSMixin.WebGetPjsMixin, ChromiumMixin.WebGetCrMixin):

	COOKIEFILE = 'cookies.lwp'  # the path and filename to save your cookies in
	cj = None
	cookielib = None
	opener = None

	errorOutCount = 2
	# retryDelay = 0.1
	retryDelay = 0.01

	data = None

	# creds is a list of 3-tuples that gets inserted into the password manager.
	# it is structured [(top_level_url1, username1, password1), (top_level_url2, username2, password2)]
	def __init__(self, creds=None, logPath="Main.WebRequest", cookie_lock=None, cloudflare=False, use_socks=False, alt_cookiejar=None):
		super().__init__()

		self.rules = {}
		self.rules['cloudflare'] = cloudflare
		if cookie_lock:
			self.cookie_lock = cookie_lock
		else:
			self.cookie_lock = COOKIEWRITELOCK

		self.use_socks = use_socks
		# Override the global default socket timeout, so hung connections will actually time out properly.
		socket.setdefaulttimeout(5)

		self.log = logging.getLogger(logPath)
		# print("Webget init! Logpath = ", logPath)
		if creds:
			print("Have creds for a domain")

		# Due to general internet people douchebaggyness, I've basically said to hell with it and decided to spoof a whole assortment of browsers
		# It should keep people from blocking this scraper *too* easily
		self.browserHeaders = Constants.getUserAgent()

		self.data = urllib.parse.urlencode(self.browserHeaders)

		if creds:
			print("Have credentials, installing password manager into urllib handler.")
			passManager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
			for url, username, password in creds:
				passManager.add_password(None, url, username, password)
			self.credHandler = Handlers.PreemptiveBasicAuthHandler(passManager)
		else:
			self.credHandler = None

		self.alt_cookiejar = alt_cookiejar
		self.__loadCookies()

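	# Illustrative construction (a sketch only; the URL and credentials below are
	# placeholders, not anything this module requires):
	#
	#   wg = WebGetRobust(
	#       creds = [("https://example.com", "someuser", "somepassword")],
	#   )
	#   page = wg.getpage("https://example.com/protected/page.html")
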
	def chunkReport(self, bytesSoFar, totalSize):
		if totalSize:
			percent = float(bytesSoFar) / totalSize
			percent = round(percent * 100, 2)
			self.log.info("Downloaded %d of %d bytes (%0.2f%%)" % (bytesSoFar, totalSize, percent))
		else:
			self.log.info("Downloaded %d bytes" % (bytesSoFar))

	def __chunkRead(self, response, chunkSize=2 ** 18, reportHook=None):
		contentLengthHeader = response.info().get('Content-Length')
		if contentLengthHeader:
			totalSize = contentLengthHeader.strip()
			totalSize = int(totalSize)
		else:
			totalSize = None
		bytesSoFar = 0
		pgContent = b""
		while 1:
			chunk = response.read(chunkSize)
			pgContent += chunk
			bytesSoFar += len(chunk)

			if not chunk:
				break

			if reportHook:
				reportHook(bytesSoFar, chunkSize, totalSize)

		return pgContent

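	# Example progress callback (illustrative sketch): the `callBack` kwarg of getpage()
	# is forwarded here as `reportHook`, so it is invoked as
	# reportHook(bytesSoFar, chunkSize, totalSize).
	#
	#   def show_progress(bytes_so_far, chunk_size, total_size):
	#       print("Fetched %d of %s bytes" % (bytes_so_far, total_size))
	#
	#   wg.getpage("https://example.com/some/big/file.bin", callBack=show_progress)
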
	def getSoupNoRedirects(self, *args, **kwargs):
		if 'returnMultiple' in kwargs:
			raise Exceptions.ArgumentError("getSoupNoRedirects cannot be called with 'returnMultiple'")

		if 'soup' in kwargs and kwargs['soup']:
			raise Exceptions.ArgumentError("getSoupNoRedirects contradicts the 'soup' directive!")

		kwargs['returnMultiple'] = True

		tgt_url = kwargs.get('requestedUrl', None)
		if not tgt_url:
			tgt_url = args[0]

		page, handle = self.getpage(*args, **kwargs)

		redirurl = handle.geturl()
		if redirurl != tgt_url:
			self.log.error("Requested %s, redirected to %s. Raising error", tgt_url, redirurl)

			raise Exceptions.RedirectedError("Requested %s, redirected to %s" % (
				tgt_url, redirurl))

		soup = as_soup(page)
		return soup

	def getSoup(self, *args, **kwargs):
		if 'returnMultiple' in kwargs and kwargs['returnMultiple']:
			raise Exceptions.ArgumentError("getSoup cannot be called with 'returnMultiple' being true")

		if 'soup' in kwargs and kwargs['soup']:
			raise Exceptions.ArgumentError("getSoup contradicts the 'soup' directive!")

		page = self.getpage(*args, **kwargs)
		if isinstance(page, bytes):
			raise Exceptions.ContentTypeError("Received content not decoded! Cannot parse!")

		soup = as_soup(page)
		return soup

	def getJson(self, *args, **kwargs):
		if 'returnMultiple' in kwargs and kwargs['returnMultiple']:
			raise Exceptions.ArgumentError("getJson cannot be called with 'returnMultiple' being true")

		attempts = 0
		while 1:
			try:
				page = self.getpage(*args, **kwargs)
				if isinstance(page, bytes):
					page = page.decode(determine_json_encoding(page))
					# raise ValueError("Received content not decoded! Cannot parse!")

				page = page.strip()
				ret = json.loads(page)
				return ret
			except ValueError:
				if attempts < 1:
					attempts += 1
					self.log.error("JSON Parsing issue retrieving content from page!")
					for line in traceback.format_exc().split("\n"):
						self.log.error("%s", line.rstrip())
					self.log.error("Retrying!")

					# Scramble our current UA
					self.browserHeaders = Constants.getUserAgent()
					if self.alt_cookiejar:
						self.cj.init_agent(new_headers=self.browserHeaders)

					time.sleep(self.retryDelay)
				else:
					self.log.error("JSON Parsing issue, and retries exhausted!")
					# self.log.error("Page content:")
					# self.log.error(page)
					# with open("Error-ctnt-{}.json".format(time.time()), "w") as tmp_err_fp:
					# 	tmp_err_fp.write(page)
					raise

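	# Typical use (illustrative sketch; the URL and key are placeholders):
	#
	#   wg = WebGetRobust()
	#   data = wg.getJson("https://example.com/api/status.json")
	#   print(data["status"])
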
	def getFileAndName(self, *args, **kwargs):
		if 'returnMultiple' in kwargs:
			raise Exceptions.ArgumentError("getFileAndName cannot be called with 'returnMultiple'")

		if 'soup' in kwargs and kwargs['soup']:
			raise Exceptions.ArgumentError("getFileAndName contradicts the 'soup' directive!")

		kwargs["returnMultiple"] = True

		pgctnt, pghandle = self.getpage(*args, **kwargs)

		info = pghandle.info()
		if not 'Content-Disposition' in info:
			hName = ''
		elif not 'filename=' in info['Content-Disposition']:
			hName = ''
		else:
			hName = info['Content-Disposition'].split('filename=')[1]

		return pgctnt, hName

	def getFileNameMime(self, *args, **kwargs):
		if 'returnMultiple' in kwargs:
			raise Exceptions.ArgumentError("getFileNameMime cannot be called with 'returnMultiple'")

		if 'soup' in kwargs and kwargs['soup']:
			raise Exceptions.ArgumentError("getFileNameMime contradicts the 'soup' directive!")

		kwargs["returnMultiple"] = True

		pgctnt, pghandle = self.getpage(*args, **kwargs)

		info = pghandle.info()
		if not 'Content-Disposition' in info:
			hName = ''
		elif not 'filename=' in info['Content-Disposition']:
			hName = ''
		else:
			hName = info['Content-Disposition'].split('filename=')[1]

		mime = info.get_content_type()

		return pgctnt, hName, mime

	def getpage(self, requestedUrl, **kwargs):
		self.log.info("Fetching content at URL: %s", requestedUrl)

		# strip trailing and leading spaces.
		requestedUrl = requestedUrl.strip()

		# If we have 'soup' as a param, just pop it, and call `getSoup()`.
		if 'soup' in kwargs and kwargs['soup']:
			self.log.warning("'soup' kwarg is deprecated. Please use the `getSoup()` call instead.")
			kwargs.pop('soup')
			return self.getSoup(requestedUrl, **kwargs)

		# Decode the kwargs values
		addlHeaders    = kwargs.setdefault("addlHeaders",    None)
		returnMultiple = kwargs.setdefault("returnMultiple", False)
		callBack       = kwargs.setdefault("callBack",       None)
		postData       = kwargs.setdefault("postData",       None)
		retryQuantity  = kwargs.setdefault("retryQuantity",  None)
		nativeError    = kwargs.setdefault("nativeError",    False)
		binaryForm     = kwargs.setdefault("binaryForm",     False)

		# Conditionally encode the referrer if needed, because otherwise
		# urllib will barf on unicode referrer values.
		if addlHeaders and 'Referer' in addlHeaders:
			addlHeaders['Referer'] = iri2uri.iri2uri(addlHeaders['Referer'])

		retryCount = 0
		while 1:

			pgctnt = None
			pghandle = None

			pgreq = self.__buildRequest(requestedUrl, postData, addlHeaders, binaryForm)

			errored = False
			lastErr = ""

			retryCount = retryCount + 1

			if (retryQuantity and retryCount > retryQuantity) or (not retryQuantity and retryCount > self.errorOutCount):
				self.log.error("Failed to retrieve Website : %s at %s All Attempts Exhausted", pgreq.get_full_url(), time.ctime(time.time()))
				pgctnt = None
				try:
					self.log.critical("Critical Failure to retrieve page! %s at %s, attempt %s", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
					self.log.critical("Error: %s", lastErr)
					self.log.critical("Exiting")
				except:
					self.log.critical("And the URL could not be printed due to an encoding error")
				break

#print "execution", retryCount
|
|
try:
|
|
# print("Getpage!", requestedUrl, kwargs)
|
|
pghandle = self.opener.open(pgreq, timeout=30) # Get Webpage
|
|
# print("Gotpage")
|
|
|
|
except urllib.error.HTTPError as e: # Lotta logging
|
|
self.log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
|
|
self.log.warning("Error Code: %s", e)
|
|
|
|
#traceback.print_exc()
|
|
lastErr = e
|
|
try:
|
|
|
|
self.log.warning("Original URL: %s", requestedUrl)
|
|
errored = True
|
|
except:
|
|
self.log.warning("And the URL could not be printed due to an encoding error")
|
|
|
|
if e.code == 404:
|
|
#print "Unrecoverable - Page not found. Breaking"
|
|
self.log.critical("Unrecoverable - Page not found. Breaking")
|
|
break
|
|
|
|
time.sleep(self.retryDelay)
|
|
if e.code == 503:
|
|
errcontent = e.read()
|
|
if b'This process is automatic. Your browser will redirect to your requested content shortly.' in errcontent:
|
|
self.log.warning("Cloudflare failure! Doing automatic step-through.")
|
|
self.stepThroughCloudFlare(requestedUrl, titleNotContains="Just a moment...")
|
|
except UnicodeEncodeError:
|
|
self.log.critical("Unrecoverable Unicode issue retreiving page - %s", requestedUrl)
|
|
for line in traceback.format_exc().split("\n"):
|
|
self.log.critical("%s", line.rstrip())
|
|
self.log.critical("Parameters:")
|
|
self.log.critical(" requestedUrl: '%s'", requestedUrl)
|
|
self.log.critical(" postData: '%s'", postData)
|
|
self.log.critical(" addlHeaders: '%s'", addlHeaders)
|
|
self.log.critical(" binaryForm: '%s'", binaryForm)
|
|
|
|
break
|
|
|
|
except Exception:
|
|
errored = True
|
|
#traceback.print_exc()
|
|
lastErr = sys.exc_info()
|
|
self.log.warning("Retreival failed. Traceback:")
|
|
self.log.warning(str(lastErr))
|
|
self.log.warning(traceback.format_exc())
|
|
|
|
self.log.warning("Error Retrieving Page! - Trying again - Waiting %s seconds", self.retryDelay)
|
|
|
|
try:
|
|
self.log.critical("Error on page - %s", requestedUrl)
|
|
except:
|
|
self.log.critical("And the URL could not be printed due to an encoding error")
|
|
|
|
time.sleep(self.retryDelay)
|
|
|
|
continue
|
|
|
|
if pghandle != None:
|
|
self.log.info("Request for URL: %s succeeded at %s On Attempt %s. Recieving...", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
|
|
pgctnt = self.__retreiveContent(pgreq, pghandle, callBack)
|
|
|
|
# if __retreiveContent did not return false, it managed to fetch valid results, so break
|
|
if pgctnt != False:
|
|
break
|
|
|
|
if errored and pghandle != None:
|
|
print(("Later attempt succeeded %s" % pgreq.get_full_url()))
|
|
elif (errored or not pgctnt) and pghandle is None:
|
|
|
|
if lastErr and nativeError:
|
|
raise lastErr
|
|
raise Exceptions.FetchFailureError("Failed to retreive page '%s'!" % (requestedUrl, ))
|
|
|
|
if returnMultiple:
|
|
|
|
return pgctnt, pghandle
|
|
else:
|
|
return pgctnt
|
|
|
|
	def getItem(self, itemUrl):

		try:
			content, handle = self.getpage(itemUrl, returnMultiple=True)
		except:
			print("Failure?")
			if self.rules['cloudflare']:
				if not self.stepThroughCloudFlare(itemUrl, titleNotContains='Just a moment...'):
					raise Exceptions.FetchFailureError("Could not step through cloudflare!")
				# Cloudflare cookie set, retrieve again
				content, handle = self.getpage(itemUrl, returnMultiple=True)
			else:
				raise

		if not content or not handle:
			raise urllib.error.URLError("Failed to retrieve file from page '%s'!" % itemUrl)

		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		mType = handle.info()['Content-Type']

		# If there is an encoding in the content-type (or any other info), strip it out.
		# We don't care about the encoding, since WebFunctions will already have handled that,
		# and returned a decoded unicode object.
		if mType and ";" in mType:
			mType = mType.split(";")[0].strip()

		# *sigh*. So minus.com is fucking up their http headers, and apparently urlencoding the
		# mime type, because apparently they're shit at things.
		# Anyways, fix that.
		if '%2F' in mType:
			mType = mType.replace('%2F', '/')

		self.log.info("Retrieved file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
		return content, fileN, mType

	def getHead(self, url, addlHeaders=None):
		for x in range(9999):
			try:
				self.log.info("Doing HTTP HEAD request for '%s'", url)
				pgreq = self.__buildRequest(url, None, addlHeaders, None, req_class=Handlers.HeadRequest)
				pghandle = self.opener.open(pgreq, timeout=30)
				returl = pghandle.geturl()
				if returl != url:
					self.log.info("HEAD request returned a different URL '%s'", returl)

				return returl
			except socket.timeout as e:
				self.log.info("Timeout, retrying....")
				if x >= 3:
					self.log.error("Failure fetching: %s", url)
					raise Exceptions.FetchFailureError("Timeout when fetching %s. Error: %s" % (url, e))
			except urllib.error.URLError as e:
				# Continue even in the face of cloudflare crapping its pants.
				# Only HTTPError (a URLError subclass) actually carries a HTTP status code.
				if isinstance(e, urllib.error.HTTPError) and e.code == 500 and e.geturl():
					return e.geturl()
				self.log.info("URLError, retrying....")
				if x >= 3:
					self.log.error("Failure fetching: %s", url)
					raise Exceptions.FetchFailureError("URLError when fetching %s. Error: %s" % (url, e))

	######################################################################################################################################################
	######################################################################################################################################################

	def __decodeHtml(self, pageContent, cType):

		# this *should* probably be done using a parser.
		# However, it seems to be grossly overkill to shove the whole page (which can be quite large) through a parser just to pull out a tag that
		# should be right near the page beginning anyways.
		# As such, it's a regular expression for the moment

		# Regex is of bytes type, since we can't convert a string to unicode until we know the encoding the
		# bytes string is using, and we need the regex to get that encoding
		coding = re.search(rb"charset=[\'\"]?([a-zA-Z0-9\-]*)[\'\"]?", pageContent, flags=re.IGNORECASE)

		cType = b""
		charset = None
		try:
			if coding:
				cType = coding.group(1)
				codecs.lookup(cType.decode("ascii"))
				charset = cType.decode("ascii")

		except LookupError:

			# I'm actually not sure what I was thinking when I wrote this if statement. I don't think it'll ever trigger.
			if (b";" in cType) and (b"=" in cType):
				# the server is reporting an encoding. Now we use it to decode the content.
				dummy_docType, charset = cType.split(b";")
				charset = charset.split(b"=")[-1]

		if not charset:
			self.log.warning("Could not find encoding information on page - Using default charset. Shit may break!")
			charset = "iso-8859-1"

		try:
			pageContent = str(pageContent, charset)

		except UnicodeDecodeError:
			self.log.error("Encoding Error! Stripping invalid chars.")
			pageContent = pageContent.decode('utf-8', errors='ignore')

		return pageContent

	def __buildRequest(self, pgreq, postData, addlHeaders, binaryForm, req_class=None):
		if req_class is None:
			req_class = urllib.request.Request

		pgreq = iri2uri.iri2uri(pgreq)

		try:
			params = {}
			headers = {}
			if postData != None:
				self.log.info("Making a post-request! Params: '%s'", postData)
				params['data'] = urllib.parse.urlencode(postData).encode("utf-8")
			if addlHeaders != None:
				self.log.info("Have additional request headers!")
				for key, parameter in addlHeaders.items():
					self.log.info(" Item: '%s' -> '%s'", key, parameter)
				headers = addlHeaders
			if binaryForm:
				self.log.info("Binary form submission!")
				if 'data' in params:
					raise Exceptions.ArgumentError("You cannot make a binary form post and a plain post request at the same time!")

				params['data'] = binaryForm.make_result()
				headers['Content-type'] = binaryForm.get_content_type()
				headers['Content-length'] = len(params['data'])

			return req_class(pgreq, headers=headers, **params)

		except:
			self.log.critical("Invalid header or url")
			raise

	def __decompressContent(self, coding, pgctnt):
		#preLen = len(pgctnt)
		if coding == 'deflate':
			compType = "deflate"

			pgctnt = zlib.decompress(pgctnt, -zlib.MAX_WBITS)

		elif coding == 'gzip':
			compType = "gzip"

			buf = io.BytesIO(pgctnt)
			f = gzip.GzipFile(fileobj=buf)
			pgctnt = f.read()

		elif coding == "sdch":
			raise Exceptions.ContentTypeError("Wait, someone other than google actually supports SDCH compression?")

		else:
			compType = "none"

		return compType, pgctnt

	def __decodeTextContent(self, pgctnt, cType):

		if cType:
			if (";" in cType) and ("=" in cType):
				# the server is reporting an encoding. Now we use it to decode the content
				# Some weirdos put two charsets in their headers:
				# `text/html;Charset=UTF-8;charset=UTF-8`
				# Split, and take the first two entries.
				docType, charset = cType.split(";")[:2]
				charset = charset.split("=")[-1]

				# Only decode content marked as text (yeah, google is serving zip files
				# with the content-disposition charset header specifying "UTF-8") or
				# specifically allowed other content types I know are really text.
				decode = ['application/atom+xml', 'application/xml', "application/json", 'text']
				if any([item in docType for item in decode]):
					try:
						pgctnt = str(pgctnt, charset)
					except UnicodeDecodeError:
						self.log.error("Encoding Error! Stripping invalid chars.")
						pgctnt = pgctnt.decode('utf-8', errors='ignore')

			else:
				# The server is not reporting an encoding in the headers.
				# Use content-aware mechanisms for determining the content encoding.

				if "text/html" in cType or \
					'text/javascript' in cType or \
					'text/css' in cType or \
					'application/xml' in cType or \
					'application/atom+xml' in cType:  # If this is a html/text page, we want to decode it using the local encoding

					pgctnt = self.__decodeHtml(pgctnt, cType)

				elif "text/plain" in cType or "text/xml" in cType:
					pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup

				# Assume JSON is utf-8. Probably a bad idea?
				elif "application/json" in cType:
					pgctnt = pgctnt.decode('utf-8')

				elif "text" in cType:
					self.log.critical("Unknown content type!")
					self.log.critical(cType)

		else:
			self.log.critical("No content-type header!")
			self.log.critical("Cannot guess content type!")

		return pgctnt

	def __retreiveContent(self, pgreq, pghandle, callBack):
		try:
			# If we have a progress callback, call it for chunked read.
			# Otherwise, just read in the entire content.
			if callBack:
				pgctnt = self.__chunkRead(pghandle, 2 ** 17, reportHook=callBack)
			else:
				pgctnt = pghandle.read()

			if pgctnt is None:
				return False

			self.log.info("URL fully retrieved.")

			preDecompSize = len(pgctnt)/1000.0

			encoded = pghandle.headers.get('Content-Encoding')
			compType, pgctnt = self.__decompressContent(encoded, pgctnt)

			decompSize = len(pgctnt)/1000.0
			# self.log.info("Page content type = %s", type(pgctnt))
			cType = pghandle.headers.get("Content-Type")
			if compType == 'none':
				self.log.info("Compression type = %s. Content Size = %0.3fK. File type: %s.", compType, decompSize, cType)
			else:
				self.log.info("Compression type = %s. Content Size compressed = %0.3fK. Decompressed = %0.3fK. File type: %s.", compType, preDecompSize, decompSize, cType)

			pgctnt = self.__decodeTextContent(pgctnt, cType)

			return pgctnt

		except:
			print("pghandle = ", pghandle)

			self.log.error(sys.exc_info())
			traceback.print_exc()
			self.log.error("Error Retrieving Page! - Transfer failed. Waiting %s seconds before retrying", self.retryDelay)

			try:
				self.log.critical("Critical Failure to retrieve page! %s at %s", pgreq.get_full_url(), time.ctime(time.time()))
				self.log.critical("Exiting")
			except:
				self.log.critical("And the URL could not be printed due to an encoding error")
			print()
			self.log.error(pghandle)
			time.sleep(self.retryDelay)

			return False

	# HUGE GOD-FUNCTION.
	# OH GOD FIXME.

	# postData expects a dict
	# addlHeaders also expects a dict

	######################################################################################################################################################
	######################################################################################################################################################

	def __loadCookies(self):

		if self.alt_cookiejar is not None:
			self.alt_cookiejar.init_agent(new_headers=self.browserHeaders)
			self.cj = self.alt_cookiejar
		else:
			self.cj = http.cookiejar.LWPCookieJar()  # This is a subclass of FileCookieJar
			                                         # that has useful load and save methods
		if self.cj is not None:
			if os.path.isfile(self.COOKIEFILE):
				try:
					self.__updateCookiesFromFile()
					# self.log.info("Loading CookieJar")
				except:
					self.log.critical("Cookie file is corrupt/damaged?")
					try:
						os.remove(self.COOKIEFILE)
					except FileNotFoundError:
						pass
			if http.cookiejar is not None:
				# self.log.info("Installing CookieJar")
				self.log.debug(self.cj)
				cookieHandler = urllib.request.HTTPCookieProcessor(self.cj)
				args = (cookieHandler, Handlers.HTTPRedirectHandler)
				if self.credHandler:
					print("Have cred handler. Building opener using it")
					args += (self.credHandler, )
				if self.use_socks:
					print("Using Socks handler")
					if not HAVE_SOCKS:
						raise RuntimeError("SOCKS Use specified, and no socks installed!")
					args = (SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050), ) + args

				self.opener = urllib.request.build_opener(*args)
				#self.opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
				self.opener.addheaders = self.browserHeaders
				#urllib2.install_opener(self.opener)

		for cookie in self.cj:
			self.log.debug(cookie)
			#print cookie

	def __syncCookiesFromFile(self):
		# self.log.info("Synchronizing cookies with cookieFile.")
		if os.path.isfile(self.COOKIEFILE):
			self.cj.save("cookietemp.lwp")
			self.cj.load(self.COOKIEFILE)
			self.cj.load("cookietemp.lwp")
		# First, load any changed cookies so we don't overwrite them
		# However, we want to persist any cookies that we have that are more recent than the saved cookies, so we temporarily save
		# the cookies in memory to a temp-file, then load the cookiefile, and finally overwrite the loaded cookies with the ones from the
		# temp file

	def __updateCookiesFromFile(self):
		if os.path.exists(self.COOKIEFILE):
			# self.log.info("Synchronizing cookies with cookieFile.")
			self.cj.load(self.COOKIEFILE)
			# Update cookies from cookiefile

	def addCookie(self, inCookie):
		self.log.info("Updating cookie!")
		self.cj.set_cookie(inCookie)

	def saveCookies(self, halting=False):

		locked = self.cookie_lock.acquire(timeout=5)
		if not locked:
			self.log.error("Failed to acquire cookie-lock!")
			return

		# print("Have %d cookies before saving cookiejar" % len(self.cj))
		try:
			# self.log.info("Trying to save cookies!")
			if self.cj is not None:  # If cookies were used

				self.__syncCookiesFromFile()

				# self.log.info("Have cookies to save")
				for cookie in self.cj:
					# print(cookie)
					# print(cookie.expires)

					if isinstance(cookie.expires, int) and cookie.expires > 30000000000:  # Clamp cookies that expire stupidly far in the future because people are assholes
						cookie.expires = 30000000000

				# self.log.info("Calling save function")
				self.cj.save(self.COOKIEFILE)  # save the cookies again

				# self.log.info("Cookies Saved")
			else:
				self.log.info("No cookies to save?")
		except Exception as e:
			pass
			# The destructor call order is too incoherent, and shit fails
			# during the teardown with null-references. The error printout is
			# not informative, so just silence it.
			# print("Possible error on exit (or just the destructor): '%s'." % e)
		finally:
			self.cookie_lock.release()

		# print("Have %d cookies after saving cookiejar" % len(self.cj))
		if not halting:
			self.__syncCookiesFromFile()
		# print "Have %d cookies after reloading cookiejar" % len(self.cj)

	def getCookies(self):

		locked = self.cookie_lock.acquire(timeout=5)
		if not locked:
			raise RuntimeError("Could not acquire lock on cookiejar")

		try:
			# self.log.info("Trying to save cookies!")
			if self.cj is not None:  # If cookies were used
				self.__syncCookiesFromFile()
		finally:
			self.cookie_lock.release()

		return self.cj

	######################################################################################################################################################
	######################################################################################################################################################

	def __del__(self):
		# print "WGH Destructor called!"
		# print("WebRequest __del__")
		self.saveCookies(halting=True)

		sup = super()
		if hasattr(sup, '__del__'):
			sup.__del__()

	def stepThroughCloudFlare(self, *args, **kwargs):
		# Shim to the underlying web browser of choice.
		# Callers (e.g. getItem) check the return value, so pass it through.
		return self.stepThroughCloudFlare_pjs(*args, **kwargs)
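
if __name__ == "__main__":
	# Minimal smoke-test sketch (illustrative only). This module uses relative imports,
	# so it has to be run in its package context (e.g. `python -m <package>.<this module>`),
	# it assumes the PhantomJS/Chromium mixins initialize without extra configuration,
	# and the target URL below is just a placeholder.
	logging.basicConfig(level=logging.INFO)
	wg = WebGetRobust()
	soup = wg.getSoup("https://example.com/")
	print(soup.title)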