# booruScraper/util/WebRequest/WebRequestClass.py

#!/usr/bin/python3
import urllib.request
import urllib.parse
import urllib.error
import os.path
import time
import http.cookiejar
import traceback
import logging
import zlib
import codecs
import re
import sys
import gzip
import io
import socket
import json
from threading import Lock
import bs4
try:
import socks
from sockshandler import SocksiPyHandler
HAVE_SOCKS = True
except ImportError:
HAVE_SOCKS = False
from . import HeaderParseMonkeyPatch
from . import ChromiumMixin
from . import PhantomJSMixin
from . import Handlers
from . import iri2uri
from . import Constants
from . import Exceptions
#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703
COOKIEWRITELOCK = Lock()
GLOBAL_COOKIE_FILE = None
def as_soup(markup):
return bs4.BeautifulSoup(markup, "lxml")
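# Thin convenience wrapper so the parser backend ("lxml") only needs to be specified in one place.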
def determine_json_encoding(json_bytes):
'''
Given the fact that the first 2 characters in json are guaranteed to be ASCII, we can use
these to determine the encoding.
See: http://tools.ietf.org/html/rfc4627#section-3
Copied here:
Since the first two characters of a JSON text will always be ASCII
characters [RFC0020], it is possible to determine whether an octet
stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
at the pattern of nulls in the first four octets.
00 00 00 xx UTF-32BE
00 xx 00 xx UTF-16BE
xx 00 00 00 UTF-32LE
xx 00 xx 00 UTF-16LE
xx xx xx xx UTF-8
'''
assert(isinstance(json_bytes, bytes))
if len(json_bytes) >= 4:
b1, b2, b3, b4 = json_bytes[0], json_bytes[1], json_bytes[2], json_bytes[3]
if b1 == 0 and b2 == 0 and b3 == 0 and b4 != 0:
return "UTF-32BE"
elif b1 == 0 and b2 != 0 and b3 == 0 and b4 != 0:
return "UTF-16BE"
elif b1 != 0 and b2 == 0 and b3 == 0 and b4 == 0:
return "UTF-32LE"
elif b1 != 0 and b2 == 0 and b3 != 0 and b4 == 0:
return "UTF-16LE"
elif b1 != 0 and b2 != 0 and b3 != 0 and b4 != 0:
return "UTF-8"
else:
raise Exceptions.ContentTypeError("Unknown encoding!")
elif len(json_bytes) >= 2:
b1, b2 = json_bytes[0], json_bytes[1]
if b1 == 0 and b2 == 0:
return "UTF-32BE"
elif b1 == 0 and b2 != 0:
return "UTF-16BE"
elif b1 != 0 and b2 == 0:
raise Exceptions.ContentTypeError("Json string too short to definitively infer encoding.")
elif b1 != 0 and b2 != 0:
return "UTF-8"
else:
raise Exceptions.ContentTypeError("Unknown encoding!")
raise Exceptions.ContentTypeError("Input string too short to guess encoding!")
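# Usage sketch (doctest-style, not part of the original module):
#
#   >>> determine_json_encoding(b'{"key": "value"}')
#   'UTF-8'
#   >>> determine_json_encoding('{"key": "value"}'.encode("utf-16-le"))
#   'UTF-16LE'
#
# Byte strings shorter than two bytes raise Exceptions.ContentTypeError.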
# A urllib wrapper that provides error handling and logging, as well as cookie management. It's a bit crude, but it works.
# Also supports transport compression.
# OOOOLLLLLLDDDDD, has lots of creaky internals. Needs some cleanup desperately, but lots of crap depends on almost everything.
# Arrrgh.
class WebGetRobust(PhantomJSMixin.WebGetPjsMixin, ChromiumMixin.WebGetCrMixin):
COOKIEFILE = 'cookies.lwp' # the path and filename to save your cookies in
cj = None
cookielib = None
opener = None
errorOutCount = 2
# retryDelay = 0.1
retryDelay = 0.01
data = None
# creds is a list of 3-tuples that gets inserted into the password manager.
# it is structured [(top_level_url1, username1, password1), (top_level_url2, username2, password2)]
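# e.g. creds = [("https://example.com", "someuser", "hunter2")]   (placeholder values)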
def __init__(self, creds=None, logPath="Main.WebRequest", cookie_lock=None, cloudflare=False, use_socks=False, alt_cookiejar=None):
super().__init__()
self.rules = {}
self.rules['cloudflare'] = cloudflare
if cookie_lock:
self.cookie_lock = cookie_lock
else:
self.cookie_lock = COOKIEWRITELOCK
self.use_socks = use_socks
# Override the global default socket timeout, so hung connections will actually time out properly.
socket.setdefaulttimeout(5)
self.log = logging.getLogger(logPath)
# print("Webget init! Logpath = ", logPath)
if creds:
print("Have creds for a domain")
# Due to general internet people douchebaggyness, I've basically said to hell with it and decided to spoof a whole assortment of browsers
# It should keep people from blocking this scraper *too* easily
self.browserHeaders = Constants.getUserAgent()
self.data = urllib.parse.urlencode(self.browserHeaders)
if creds:
print("Have credentials, installing password manager into urllib handler.")
passManager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
for url, username, password in creds:
passManager.add_password(None, url, username, password)
self.credHandler = Handlers.PreemptiveBasicAuthHandler(passManager)
else:
self.credHandler = None
self.alt_cookiejar = alt_cookiejar
self.__loadCookies()
def chunkReport(self, bytesSoFar, totalSize):
if totalSize:
percent = float(bytesSoFar) / totalSize
percent = round(percent * 100, 2)
self.log.info("Downloaded %d of %d bytes (%0.2f%%)" % (bytesSoFar, totalSize, percent))
else:
self.log.info("Downloaded %d bytes" % (bytesSoFar))
def __chunkRead(self, response, chunkSize=2 ** 18, reportHook=None):
contentLengthHeader = response.info().get('Content-Length')
if contentLengthHeader:
totalSize = contentLengthHeader.strip()
totalSize = int(totalSize)
else:
totalSize = None
bytesSoFar = 0
pgContent = b"" # response.read() returns bytes, so accumulate bytes rather than str
while 1:
chunk = response.read(chunkSize)
pgContent += chunk
bytesSoFar += len(chunk)
if not chunk:
break
if reportHook:
reportHook(bytesSoFar, chunkSize, totalSize)
return pgContent
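# Note: the reportHook above is invoked as reportHook(bytesSoFar, chunkSize, totalSize),
# while chunkReport() takes (bytesSoFar, totalSize), so a small wrapper is needed if
# chunkReport is to be used as the progress callback.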
def getSoupNoRedirects(self, *args, **kwargs):
if 'returnMultiple' in kwargs:
raise Exceptions.ArgumentError("getSoupNoRedirects cannot be called with 'returnMultiple'")
if 'soup' in kwargs and kwargs['soup']:
raise Exceptions.ArgumentError("getSoupNoRedirects contradicts the 'soup' directive!")
kwargs['returnMultiple'] = True
tgt_url = kwargs.get('requestedUrl', None)
if not tgt_url:
tgt_url = args[0]
page, handle = self.getpage(*args, **kwargs)
redirurl = handle.geturl()
if redirurl != tgt_url:
self.log.error("Requested %s, redirected to %s. Raising error", tgt_url, redirurl)
raise Exceptions.RedirectedError("Requested %s, redirected to %s" % (
tgt_url, redirurl))
soup = as_soup(page)
return soup
def getSoup(self, *args, **kwargs):
if 'returnMultiple' in kwargs and kwargs['returnMultiple']:
raise Exceptions.ArgumentError("getSoup cannot be called with 'returnMultiple' being true")
if 'soup' in kwargs and kwargs['soup']:
raise Exceptions.ArgumentError("getSoup contradicts the 'soup' directive!")
page = self.getpage(*args, **kwargs)
if isinstance(page, bytes):
raise Exceptions.ContentTypeError("Received content not decoded! Cannot parse!")
soup = as_soup(page)
return soup
def getJson(self, *args, **kwargs):
if 'returnMultiple' in kwargs and kwargs['returnMultiple']:
raise Exceptions.ArgumentError("getJson cannot be called with 'returnMultiple' being true")
attempts = 0
while 1:
try:
page = self.getpage(*args, **kwargs)
if isinstance(page, bytes):
page = page.decode(determine_json_encoding(page))
# raise ValueError("Received content not decoded! Cannot parse!")
page = page.strip()
ret = json.loads(page)
return ret
except ValueError:
if attempts < 1:
attempts += 1
self.log.error("JSON parsing issue retrieving content from page!")
for line in traceback.format_exc().split("\n"):
self.log.error("%s", line.rstrip())
self.log.error("Retrying!")
# Scramble our current UA
self.browserHeaders = Constants.getUserAgent()
if self.alt_cookiejar:
self.cj.init_agent(new_headers=self.browserHeaders)
time.sleep(self.retryDelay)
else:
self.log.error("JSON Parsing issue, and retries exhausted!")
# self.log.error("Page content:")
# self.log.error(page)
# with open("Error-ctnt-{}.json".format(time.time()), "w") as tmp_err_fp:
# tmp_err_fp.write(page)
raise
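# Usage sketch for getJson(), given an instance `wg = WebGetRobust()` (placeholder URL):
#   api_data = wg.getJson("https://example.com/posts.json")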
def getFileAndName(self, *args, **kwargs):
if 'returnMultiple' in kwargs:
raise Exceptions.ArgumentError("getFileAndName cannot be called with 'returnMultiple'")
if 'soup' in kwargs and kwargs['soup']:
raise Exceptions.ArgumentError("getFileAndName contradicts the 'soup' directive!")
kwargs["returnMultiple"] = True
pgctnt, pghandle = self.getpage(*args, **kwargs)
info = pghandle.info()
if 'Content-Disposition' not in info:
hName = ''
elif 'filename=' not in info['Content-Disposition']:
hName = ''
else:
hName = info['Content-Disposition'].split('filename=')[1]
return pgctnt, hName
def getFileNameMime(self, *args, **kwargs):
if 'returnMultiple' in kwargs:
raise Exceptions.ArgumentError("getFileNameMime cannot be called with 'returnMultiple'")
if 'soup' in kwargs and kwargs['soup']:
raise Exceptions.ArgumentError("getFileNameMime contradicts the 'soup' directive!")
kwargs["returnMultiple"] = True
pgctnt, pghandle = self.getpage(*args, **kwargs)
info = pghandle.info()
if 'Content-Disposition' not in info:
hName = ''
elif 'filename=' not in info['Content-Disposition']:
hName = ''
else:
hName = info['Content-Disposition'].split('filename=')[1]
mime = info.get_content_type()
return pgctnt, hName, mime
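# Usage sketch for getFileNameMime() (placeholder URL):
#   content, filename, mimetype = wg.getFileNameMime("https://example.com/images/1234.jpg")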
def getpage(self, requestedUrl, **kwargs):
self.log.info("Fetching content at URL: %s", requestedUrl)
# strip trailing and leading spaces.
requestedUrl = requestedUrl.strip()
# If we have 'soup' as a param, just pop it, and call `getSoup()`.
if 'soup' in kwargs and kwargs['soup']:
self.log.warning("The 'soup' kwarg is deprecated. Please use the `getSoup()` call instead.")
kwargs.pop('soup')
return self.getSoup(requestedUrl, **kwargs)
# Decode the kwargs values
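# addlHeaders    - dict of extra HTTP headers to send with the request
# returnMultiple - if True, return (content, handle) instead of just the content
# callBack       - progress hook; switches to chunked reads via __chunkRead()
# postData       - dict of POST parameters (turns the request into a POST)
# retryQuantity  - per-call override for self.errorOutCount
# nativeError    - if True, re-raise the underlying urllib error instead of FetchFailureError
# binaryForm     - multipart form object (consumed in __buildRequest) for binary uploads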
addlHeaders = kwargs.setdefault("addlHeaders", None)
returnMultiple = kwargs.setdefault("returnMultiple", False)
callBack = kwargs.setdefault("callBack", None)
postData = kwargs.setdefault("postData", None)
retryQuantity = kwargs.setdefault("retryQuantity", None)
nativeError = kwargs.setdefault("nativeError", False)
binaryForm = kwargs.setdefault("binaryForm", False)
# Conditionally encode the referrer if needed, because otherwise
# urllib will barf on unicode referrer values.
if addlHeaders and 'Referer' in addlHeaders:
addlHeaders['Referer'] = iri2uri.iri2uri(addlHeaders['Referer'])
retryCount = 0
while 1:
pgctnt = None
pghandle = None
pgreq = self.__buildRequest(requestedUrl, postData, addlHeaders, binaryForm)
errored = False
lastErr = ""
retryCount = retryCount + 1
if (retryQuantity and retryCount > retryQuantity) or (not retryQuantity and retryCount > self.errorOutCount):
self.log.error("Failed to retrieve website %s at %s. All attempts exhausted.", pgreq.get_full_url(), time.ctime(time.time()))
pgctnt = None
try:
self.log.critical("Critical Failure to retrieve page! %s at %s, attempt %s", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
self.log.critical("Error: %s", lastErr)
self.log.critical("Exiting")
except:
self.log.critical("And the URL could not be printed due to an encoding error")
break
#print "execution", retryCount
try:
# print("Getpage!", requestedUrl, kwargs)
pghandle = self.opener.open(pgreq, timeout=30) # Get Webpage
# print("Gotpage")
except urllib.error.HTTPError as e: # Lotta logging
self.log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
self.log.warning("Error Code: %s", e)
#traceback.print_exc()
lastErr = e
try:
self.log.warning("Original URL: %s", requestedUrl)
errored = True
except:
self.log.warning("And the URL could not be printed due to an encoding error")
if e.code == 404:
#print "Unrecoverable - Page not found. Breaking"
self.log.critical("Unrecoverable - Page not found. Breaking")
break
time.sleep(self.retryDelay)
if e.code == 503:
errcontent = e.read()
if b'This process is automatic. Your browser will redirect to your requested content shortly.' in errcontent:
self.log.warning("Cloudflare failure! Doing automatic step-through.")
self.stepThroughCloudFlare(requestedUrl, titleNotContains="Just a moment...")
except UnicodeEncodeError:
self.log.critical("Unrecoverable Unicode issue retrieving page - %s", requestedUrl)
for line in traceback.format_exc().split("\n"):
self.log.critical("%s", line.rstrip())
self.log.critical("Parameters:")
self.log.critical(" requestedUrl: '%s'", requestedUrl)
self.log.critical(" postData: '%s'", postData)
self.log.critical(" addlHeaders: '%s'", addlHeaders)
self.log.critical(" binaryForm: '%s'", binaryForm)
break
except Exception as e:
errored = True
#traceback.print_exc()
lastErr = e # Store the exception instance (not sys.exc_info()), so `raise lastErr` below works under Python 3
self.log.warning("Retrieval failed. Traceback:")
self.log.warning(str(lastErr))
self.log.warning(traceback.format_exc())
self.log.warning("Error Retrieving Page! - Trying again - Waiting %s seconds", self.retryDelay)
try:
self.log.critical("Error on page - %s", requestedUrl)
except:
self.log.critical("And the URL could not be printed due to an encoding error")
time.sleep(self.retryDelay)
continue
if pghandle is not None:
self.log.info("Request for URL: %s succeeded at %s On Attempt %s. Receiving...", pgreq.get_full_url(), time.ctime(time.time()), retryCount)
pgctnt = self.__retreiveContent(pgreq, pghandle, callBack)
# if __retreiveContent did not return false, it managed to fetch valid results, so break
if pgctnt is not False:
break
if errored and pghandle is not None:
self.log.info("Later attempt succeeded %s", pgreq.get_full_url())
elif (errored or not pgctnt) and pghandle is None:
if lastErr and nativeError:
raise lastErr
raise Exceptions.FetchFailureError("Failed to retrieve page '%s'!" % (requestedUrl, ))
if returnMultiple:
return pgctnt, pghandle
else:
return pgctnt
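# Usage sketch for getpage() when both the content and the response handle are wanted,
# e.g. to inspect headers or the post-redirect URL (placeholder URL):
#   content, handle = wg.getpage("https://example.com/index.html", returnMultiple=True)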
def getItem(self, itemUrl):
try:
content, handle = self.getpage(itemUrl, returnMultiple=True)
except:
print("Failure?")
if self.rules['cloudflare']:
if not self.stepThroughCloudFlare(itemUrl, titleNotContains='Just a moment...'):
raise Exceptions.FetchFailureError("Could not step through cloudflare!")
# Cloudflare cookie set, retrieve again
content, handle = self.getpage(itemUrl, returnMultiple=True)
else:
raise
if not content or not handle:
raise urllib.error.URLError("Failed to retrieve file from page '%s'!" % itemUrl)
fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
fileN = bs4.UnicodeDammit(fileN).unicode_markup
mType = handle.info()['Content-Type']
# If there is an encoding in the content-type (or any other info), strip it out.
# We don't care about the encoding, since WebFunctions will already have handled that,
# and returned a decoded unicode object.
if mType and ";" in mType:
mType = mType.split(";")[0].strip()
# *sigh*. So minus.com is fucking up their http headers, and apparently urlencoding the
# mime type, because apparently they're shit at things.
# Anyways, fix that.
if '%2F' in mType:
mType = mType.replace('%2F', '/')
self.log.info("Retrieved file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
return content, fileN, mType
def getHead(self, url, addlHeaders=None):
for x in range(9999):
try:
self.log.info("Doing HTTP HEAD request for '%s'", url)
pgreq = self.__buildRequest(url, None, addlHeaders, None, req_class=Handlers.HeadRequest)
pghandle = self.opener.open(pgreq, timeout=30)
returl = pghandle.geturl()
if returl != url:
self.log.info("HEAD request returned a different URL '%s'", returl)
return returl
except socket.timeout as e:
self.log.info("Timeout, retrying....")
if x >= 3:
self.log.error("Failure fetching: %s", url)
raise Exceptions.FetchFailureError("Timeout when fetching %s. Error: %s" % (url, e))
except urllib.error.URLError as e:
# Continue even in the face of cloudflare crapping its pants.
# Only HTTPError (a URLError subclass) carries .code and .geturl(), so guard accordingly.
if isinstance(e, urllib.error.HTTPError) and e.code == 500 and e.geturl():
return e.geturl()
self.log.info("URLError, retrying....")
if x >= 3:
self.log.error("Failure fetching: %s", url)
raise Exceptions.FetchFailureError("URLError when fetching %s. Error: %s" % (url, e))
######################################################################################################################################################
######################################################################################################################################################
def __decodeHtml(self, pageContent, cType):
# this *should* probably be done using a parser.
# However, it seems to be grossly overkill to shove the whole page (which can be quite large) through a parser just to pull out a tag that
# should be right near the page beginning anyways.
# As such, it's a regular expression for the moment
# Regex is of bytes type, since we can't convert a string to unicode until we know the encoding the
# bytes string is using, and we need the regex to get that encoding
coding = re.search(rb"charset=[\'\"]?([a-zA-Z0-9\-]*)[\'\"]?", pageContent, flags=re.IGNORECASE)
cType = b""
charset = None
try:
if coding:
cType = coding.group(1)
codecs.lookup(cType.decode("ascii"))
charset = cType.decode("ascii")
except LookupError:
# I'm actually not sure what I was thinking when I wrote this if statement. I don't think it'll ever trigger.
if (b";" in cType) and (b"=" in cType): # the server is reporting an encoding. Now we use it to decode the
dummy_docType, charset = cType.split(b";")
charset = charset.split(b"=")[-1]
if not charset:
self.log.warning("Could not find encoding information on page - Using default charset. Shit may break!")
charset = "iso-8859-1"
try:
pageContent = str(pageContent, charset)
except UnicodeDecodeError:
self.log.error("Encoding Error! Stripping invalid chars.")
pageContent = pageContent.decode('utf-8', errors='ignore')
return pageContent
def __buildRequest(self, requestedUrl, postData, addlHeaders, binaryForm, req_class = None):
if req_class is None:
req_class = urllib.request.Request
requestedUrl = iri2uri.iri2uri(requestedUrl)
try:
params = {}
headers = {}
if postData is not None:
self.log.info("Making a post-request! Params: '%s'", postData)
params['data'] = urllib.parse.urlencode(postData).encode("utf-8")
if addlHeaders is not None:
self.log.info("Have additional GET parameters!")
for key, parameter in addlHeaders.items():
self.log.info(" Item: '%s' -> '%s'", key, parameter)
headers = addlHeaders
if binaryForm:
self.log.info("Binary form submission!")
if 'data' in params:
raise Exceptions.ArgumentError("You cannot make a binary form post and a plain post request at the same time!")
params['data'] = binaryForm.make_result()
headers['Content-type'] = binaryForm.get_content_type()
headers['Content-length'] = len(params['data'])
return req_class(requestedUrl, headers=headers, **params)
except:
self.log.critical("Invalid header or url")
raise
def __decompressContent(self, coding, pgctnt):
#preLen = len(pgctnt)
if coding == 'deflate':
compType = "deflate"
pgctnt = zlib.decompress(pgctnt, -zlib.MAX_WBITS)
elif coding == 'gzip':
compType = "gzip"
buf = io.BytesIO(pgctnt)
f = gzip.GzipFile(fileobj=buf)
pgctnt = f.read()
elif coding == "sdch":
raise Exceptions.ContentTypeError("Wait, someone other than Google actually supports SDCH compression?")
else:
compType = "none"
return compType, pgctnt
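# e.g. self.__decompressContent('gzip', gzip.compress(b'{}')) returns ('gzip', b'{}'), while an
# absent or unrecognized Content-Encoding passes the payload through unchanged as ('none', pgctnt).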
def __decodeTextContent(self, pgctnt, cType):
if cType:
if (";" in cType) and ("=" in cType):
# the server is reporting an encoding. Now we use it to decode the content
# Some weirdos put two charsets in their headers:
# `text/html;Charset=UTF-8;charset=UTF-8`
# Split, and take the first two entries.
docType, charset = cType.split(";")[:2]
charset = charset.split("=")[-1]
# Only decode content marked as text (yeah, google is serving zip files
# with the content-disposition charset header specifying "UTF-8") or
# specifically allowed other content types I know are really text.
decode = ['application/atom+xml', 'application/xml', "application/json", 'text']
if any([item in docType for item in decode]):
try:
pgctnt = str(pgctnt, charset)
except UnicodeDecodeError:
self.log.error("Encoding Error! Stripping invalid chars.")
pgctnt = pgctnt.decode('utf-8', errors='ignore')
else:
# The server is not reporting an encoding in the headers.
# Use content-aware mechanisms for determining the content encoding.
if "text/html" in cType or \
'text/javascript' in cType or \
'text/css' in cType or \
'application/xml' in cType or \
'application/atom+xml' in cType: # If this is a html/text page, we want to decode it using the local encoding
pgctnt = self.__decodeHtml(pgctnt, cType)
elif "text/plain" in cType or "text/xml" in cType:
pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup
# Assume JSON is utf-8. Probably a bad idea?
elif "application/json" in cType:
pgctnt = pgctnt.decode('utf-8')
elif "text" in cType:
self.log.critical("Unknown content type!")
self.log.critical(cType)
else:
self.log.critical("No content-type header!")
self.log.critical("Cannot guess content type!")
return pgctnt
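# In short: honour an explicit charset from the Content-Type header when the payload is a known
# text type; otherwise sniff markup via __decodeHtml(), hand text/plain and text/xml to
# UnicodeDammit, assume UTF-8 for JSON, and leave everything else as raw bytes.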
def __retreiveContent(self, pgreq, pghandle, callBack):
try:
# If we have a progress callback, call it for chunked read.
# Otherwise, just read in the entire content.
if callBack:
pgctnt = self.__chunkRead(pghandle, 2 ** 17, reportHook=callBack)
else:
pgctnt = pghandle.read()
if pgctnt is None:
return False
self.log.info("URL fully retrieved.")
preDecompSize = len(pgctnt)/1000.0
encoded = pghandle.headers.get('Content-Encoding')
compType, pgctnt = self.__decompressContent(encoded, pgctnt)
decompSize = len(pgctnt)/1000.0
# self.log.info("Page content type = %s", type(pgctnt))
cType = pghandle.headers.get("Content-Type")
if compType == 'none':
self.log.info("Compression type = %s. Content Size = %0.3fK. File type: %s.", compType, decompSize, cType)
else:
self.log.info("Compression type = %s. Content Size compressed = %0.3fK. Decompressed = %0.3fK. File type: %s.", compType, preDecompSize, decompSize, cType)
pgctnt = self.__decodeTextContent(pgctnt, cType)
return pgctnt
except:
print("pghandle = ", pghandle)
self.log.error(sys.exc_info())
traceback.print_exc()
self.log.error("Error Retrieving Page! - Transfer failed. Waiting %s seconds before retrying", self.retryDelay)
try:
self.log.critical("Critical Failure to retrieve page! %s at %s", pgreq.get_full_url(), time.ctime(time.time()))
self.log.critical("Exiting")
except:
self.log.critical("And the URL could not be printed due to an encoding error")
print()
self.log.error(pghandle)
time.sleep(self.retryDelay)
return False
# HUGE GOD-FUNCTION.
# OH GOD FIXME.
# postData expects a dict
# addlHeaders also expects a dict
######################################################################################################################################################
######################################################################################################################################################
def __loadCookies(self):
if self.alt_cookiejar is not None:
self.alt_cookiejar.init_agent(new_headers=self.browserHeaders)
self.cj = self.alt_cookiejar
else:
self.cj = http.cookiejar.LWPCookieJar() # This is a subclass of FileCookieJar
# that has useful load and save methods
if self.cj is not None:
if os.path.isfile(self.COOKIEFILE):
try:
self.__updateCookiesFromFile()
# self.log.info("Loading CookieJar")
except:
self.log.critical("Cookie file is corrupt/damaged?")
try:
os.remove(self.COOKIEFILE)
except FileNotFoundError:
pass
if http.cookiejar is not None:
# self.log.info("Installing CookieJar")
self.log.debug(self.cj)
cookieHandler = urllib.request.HTTPCookieProcessor(self.cj)
args = (cookieHandler, Handlers.HTTPRedirectHandler)
if self.credHandler:
print("Have cred handler. Building opener using it")
args += (self.credHandler, )
if self.use_socks:
print("Using Socks handler")
if not HAVE_SOCKS:
raise RuntimeError("SOCKS Use specified, and no socks installed!")
args = (SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050), ) + args
self.opener = urllib.request.build_opener(*args)
#self.opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
self.opener.addheaders = self.browserHeaders
#urllib2.install_opener(self.opener)
for cookie in self.cj:
self.log.debug(cookie)
#print cookie
def __syncCookiesFromFile(self):
# self.log.info("Synchronizing cookies with cookieFile.")
# We want to keep any in-memory cookies that are more recent than the ones on disk, so first
# stash the in-memory cookies to a temp file, then load the cookie file, and finally re-load
# the temp file so the newer in-memory cookies win.
if os.path.isfile(self.COOKIEFILE):
self.cj.save("cookietemp.lwp")
self.cj.load(self.COOKIEFILE)
self.cj.load("cookietemp.lwp")
def __updateCookiesFromFile(self):
if os.path.exists(self.COOKIEFILE):
# self.log.info("Synchronizing cookies with cookieFile.")
self.cj.load(self.COOKIEFILE)
# Update cookies from cookiefile
def addCookie(self, inCookie):
self.log.info("Updating cookie!")
self.cj.set_cookie(inCookie)
def saveCookies(self, halting=False):
locked = self.cookie_lock.acquire(timeout=5)
if not locked:
self.log.error("Failed to acquire cookie-lock!")
return
# print("Have %d cookies before saving cookiejar" % len(self.cj))
try:
# self.log.info("Trying to save cookies!")
if self.cj is not None: # If cookies were used
self.__syncCookiesFromFile()
# self.log.info("Have cookies to save")
for cookie in self.cj:
# print(cookie)
# print(cookie.expires)
if isinstance(cookie.expires, int) and cookie.expires > 30000000000: # Clamp cookies that expire stupidly far in the future because people are assholes
cookie.expires = 30000000000
# self.log.info("Calling save function")
self.cj.save(self.COOKIEFILE) # save the cookies again
# self.log.info("Cookies Saved")
else:
self.log.info("No cookies to save?")
except Exception as e:
pass
# The destructor call order is too incoherent, and shit fails
# during the teardown with null-references. The error printout is
# not informative, so just silence it.
# print("Possible error on exit (or just the destructor): '%s'." % e)
finally:
self.cookie_lock.release()
# print("Have %d cookies after saving cookiejar" % len(self.cj))
if not halting:
self.__syncCookiesFromFile()
# print "Have %d cookies after reloading cookiejar" % len(self.cj)
def getCookies(self):
locked = self.cookie_lock.acquire(timeout=5)
if not locked:
raise RuntimeError("Could not acquire lock on cookiejar")
try:
# self.log.info("Trying to save cookies!")
if self.cj is not None: # If cookies were used
self.__syncCookiesFromFile()
finally:
self.cookie_lock.release()
return self.cj
######################################################################################################################################################
######################################################################################################################################################
def __del__(self):
# print "WGH Destructor called!"
# print("WebRequest __del__")
self.saveCookies(halting=True)
sup = super()
if hasattr(sup, '__del__'):
sup.__del__()
def stepThroughCloudFlare(self, *args, **kwargs):
# Shim to the underlying web browser of choice. Forward the return value, since callers
# (e.g. getItem) check whether the step-through succeeded.
return self.stepThroughCloudFlare_pjs(*args, **kwargs)
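
######################################################################################################################################################
# Minimal usage sketch (not part of the original module). The URLs below are placeholders, and since
# this file uses relative imports, this block only executes when the file is run as a module from
# its package (e.g. `python -m <package>.WebRequestClass`).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    wg = WebGetRobust()
    # Plain fetch: returns decoded text for text content-types, raw bytes otherwise.
    page = wg.getpage("https://example.com/")
    print(len(page))
    # Same fetch, parsed into a BeautifulSoup document via lxml.
    soup = wg.getSoup("https://example.com/")
    print(soup.title)
    # Persist whatever cookies the session picked up.
    wg.saveCookies()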