From e60ecff007deb0b9eb8274a401fc6ab5903bc1d1 Mon Sep 17 00:00:00 2001
From: Fake-Name
Date: Sun, 22 Apr 2018 20:49:13 -0700
Subject: [PATCH] Fixing stuff. Apparently I somehow completely fucked up the
 xbooru fetcher. Wat.

---
 main.py                                   |   9 +-
 scraper/conf_validate.py                  |  65 ++
 scraper/fetchBase.py                      |  34 +-
 scraper/modules/KonaChanFetch.py          |  17 +-
 scraper/modules/danbooruFetch.py          |  19 +-
 scraper/modules/e621Scrape.py             |  15 +-
 scraper/modules/gelbooruFetch.py          |  17 +-
 scraper/modules/r34xxxScrape.py           |  20 +-
 scraper/modules/tbibFetch.py              |  18 +-
 scraper/modules/xbooruFetch.py            |  20 +-
 scraper/runner.py                         |  38 +-
 settings.py                               |   2 +-
 util/WebRequest/ChromiumMixin.py          | 252 -------
 util/WebRequest/Constants.py              | 518 -------------
 util/WebRequest/Exceptions.py             |  18 -
 util/WebRequest/Handlers.py               | 147 ----
 util/WebRequest/HeaderParseMonkeyPatch.py |  89 ---
 util/WebRequest/PhantomJSMixin.py         | 277 -------
 util/WebRequest/WebRequestClass.py        | 859 ----------------------
 util/WebRequest/__init__.py               |  10 -
 util/WebRequest/iri2uri.py                |  75 --
 util/WebRequest/tests/__init__.py         |   1 -
 util/WebRequest/tests/test_chromium.py    | 192 -----
 util/WebRequest/tests/test_phantom.py     | 144 ----
 util/WebRequest/tests/test_simple.py      | 330 ---------
 25 files changed, 208 insertions(+), 2978 deletions(-)
 create mode 100644 scraper/conf_validate.py
 delete mode 100644 util/WebRequest/ChromiumMixin.py
 delete mode 100644 util/WebRequest/Constants.py
 delete mode 100644 util/WebRequest/Exceptions.py
 delete mode 100644 util/WebRequest/Handlers.py
 delete mode 100644 util/WebRequest/HeaderParseMonkeyPatch.py
 delete mode 100644 util/WebRequest/PhantomJSMixin.py
 delete mode 100644 util/WebRequest/WebRequestClass.py
 delete mode 100644 util/WebRequest/__init__.py
 delete mode 100644 util/WebRequest/iri2uri.py
 delete mode 100644 util/WebRequest/tests/__init__.py
 delete mode 100644 util/WebRequest/tests/test_chromium.py
 delete mode 100644 util/WebRequest/tests/test_phantom.py
 delete mode 100644 util/WebRequest/tests/test_simple.py

diff --git a/main.py b/main.py
index e004f7f..6f722a9 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,14 @@
-
+import sys
 import util.logSetup
 import scraper.runner
+import scraper.conf_validate
 
 
 
 if __name__ == '__main__':
 	util.logSetup.initLogging()
-	scraper.runner.go()
+
+	if "probe" in sys.argv:
+		scraper.conf_validate.go()
+	else:
+		scraper.runner.go()
diff --git a/scraper/conf_validate.py b/scraper/conf_validate.py
new file mode 100644
index 0000000..4580730
--- /dev/null
+++ b/scraper/conf_validate.py
@@ -0,0 +1,65 @@
+
+import logging
+import settings
+import tqdm
+import os.path
+
+from sqlalchemy.sql.expression import func
+
+import scraper.database as db
+
+class TestEngine():
+	def __init__(self):
+		self.log = logging.getLogger("Main.Runner")
+	def check_dir(self):
+		self.log.info("Checking dir")
+		dir_ok1 = os.path.exists(settings.storeDir)
+		dir_ok2 = os.path.isdir(settings.storeDir)
+		if not dir_ok1:
+			self.log.error("Download dir (%s) doesn't exist!", settings.storeDir)
+		elif not dir_ok2:
+			self.log.error("Download dir (%s) isn't a folder!", settings.storeDir)
+		else:
+			self.log.info("Download dir appears to be OK")
+
+		dirc = os.listdir(settings.storeDir)
+		if not all([len(tmp) == 3 for tmp in dirc]):
+			self.log.error("There appears to be a non-hash-derived filename in the download store dir!")
+		else:
+			self.log.info("Download store dir looks like its contents are valid!")
+
+
+	def check_db(self):
+		session = db.session()
+
+		self.log.info("Loading rows from DB")
+		test_items = db.session.query(db.Files.filepath) \
+			.order_by(func.random())                     \
+			.limit(10000)                                \
+			.all()
+
+		self.log.info("Selected %s rows.", len(test_items))
+
+		had_bad = False
+		for filepath, in tqdm.tqdm(test_items):
+			fqp = os.path.join(settings.storeDir, filepath)
+			if not os.path.exists(fqp):
+				self.log.error("File that should exist is missing (%s, %s, %s)!", filepath, fqp, os.path.exists(fqp))
+				had_bad = True
+
+		if had_bad:
+			self.log.error("Had a missing file!")
+		else:
+			self.log.info("All checked files passed validation!")
+
+
+
+	def run(self):
+		self.check_dir()
+		self.check_db()
+
+
+def go():
+	instance = TestEngine()
+	instance.run()
+
diff --git a/scraper/fetchBase.py b/scraper/fetchBase.py
index 77fbc68..fbee380 100644
--- a/scraper/fetchBase.py
+++ b/scraper/fetchBase.py
@@ -8,13 +8,14 @@
 import abc
 import hashlib
 import concurrent.futures
+import tqdm
 
 import sqlalchemy.exc
 from sqlalchemy import desc
 from sqlalchemy import text
 from sqlalchemy.dialects.postgresql import insert
 
 import settings
-import util.WebRequest
+import WebRequest
 import scraper.runstate
 import scraper.database as db
@@ -22,8 +23,8 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
 
 	worker_threads = 6
 
-	@abc.abstractproperty
-	def content_count_max(self):
+	@abc.abstractmethod
+	def get_content_count_max(self):
 		pass
 
 	@abc.abstractproperty
@@ -41,7 +42,7 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
 
 	def __init__(self):
 		self.log = logging.getLogger(self.loggerpath)
-		self.wg = util.WebRequest.WebGetRobust(logPath=self.loggerpath+".Web")
+		self.wg = WebRequest.WebGetRobust(logPath=self.loggerpath+".Web")
 
 		self.jobs_queued = []
 
@@ -182,25 +183,30 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
 
 	def do_upsert(self):
-		UPSERT_STEP = 10000
+		UPSERT_STEP = 1000
 
 		sess = db.session()
+		total_changes = 0
 
-		for x in range(self.content_count_max, 0, UPSERT_STEP * -1):
+		pbar = tqdm.tqdm(range(self.get_content_count_max(), -1, UPSERT_STEP * -1))
+		for x in pbar:
 
-			self.log.info("[%s] - Building insert data structure %s -> %s", self.pluginkey, x, x+UPSERT_STEP)
+			# self.log.info("[%s] - Building insert data structure %s -> %s", self.pluginkey, x, x+UPSERT_STEP)
 			dat = [{"state" : 'new', "postid" : x, "source" : self.pluginkey} for x in range(x, x+UPSERT_STEP)]
 
-			self.log.info("[%s] - Building insert query", self.pluginkey)
+			# self.log.info("[%s] - Building insert query", self.pluginkey)
 			q = insert(db.Releases).values(dat)
 			q = q.on_conflict_do_nothing()
 
-			self.log.info("[%s] - Built. Doing insert.", self.pluginkey)
+			# self.log.info("[%s] - Built. 
Doing insert.", self.pluginkey) ret = sess.execute(q) changes = ret.rowcount - self.log.info("[%s] - Changed rows: %s", self.pluginkey, changes) - sess.commit() - - if not changes: - break + total_changes += changes + # if changes != UPSERT_STEP: + # self.log.info("[%s] - Changed rows: %s", self.pluginkey, changes) + if changes: + sess.commit() + pbar.set_description("Changes: %s (%s)" % (changes, total_changes)) + # if not changes: + # break self.log.info("[%s] - Done.", self.pluginkey) diff --git a/scraper/modules/KonaChanFetch.py b/scraper/modules/KonaChanFetch.py index 13c6c4d..305b552 100644 --- a/scraper/modules/KonaChanFetch.py +++ b/scraper/modules/KonaChanFetch.py @@ -13,17 +13,28 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class KonaChanFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'KonaChan' loggerpath = "Main.KonaChan" - content_count_max = 260000 def __init__(self): super().__init__() + + def get_content_count_max(self): + soup = self.wg.getSoup('https://konachan.com/post') + + thumbs = soup.find_all('li', class_=re.compile("creator-id-")) + tids = [tmp.get("id", "").strip("p") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + + return maxid + + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='tag-type-character') @@ -171,7 +182,7 @@ class KonaChanFetcher(scraper.fetchBase.AbstractFetcher): pageurl = 'https://konachan.com/post/show/{}'.format(job.postid) try: soup = self.wg.getSoup(pageurl) - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/danbooruFetch.py b/scraper/modules/danbooruFetch.py index c188a8f..2bfdbf5 100644 --- a/scraper/modules/danbooruFetch.py +++ b/scraper/modules/danbooruFetch.py @@ -13,17 +13,28 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class DanbooruFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'Danbooru' loggerpath = "Main.Danbooru" - content_count_max = 2950000 def __init__(self): super().__init__() + + def get_content_count_max(self): + soup = self.wg.getSoup('https://danbooru.donmai.us/') + + thumbs = soup.find_all('article', class_='post-preview') + tids = [tmp.get("id", "").strip("post_") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + + return maxid + + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='category-4') @@ -141,7 +152,7 @@ class DanbooruFetcher(scraper.fetchBase.AbstractFetcher): pageurl = 'https://danbooru.donmai.us/posts/{}'.format(job.postid) try: soup = self.wg.getSoup(pageurl) - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) @@ -185,7 +196,7 @@ class DanbooruFetcher(scraper.fetchBase.AbstractFetcher): except sqlalchemy.exc.IntegrityError: err += 1 db.session.rollback() - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching actual image' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/e621Scrape.py 
b/scraper/modules/e621Scrape.py index e8b06cb..4e7ab97 100644 --- a/scraper/modules/e621Scrape.py +++ b/scraper/modules/e621Scrape.py @@ -13,17 +13,26 @@ import scraper.runstate import scraper.fetchBase import scraper.database as db -import util.WebRequest +import WebRequest class E621Fetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'e621' loggerpath = "Main.e621" - content_count_max = 1390000 def __init__(self): super().__init__() + def get_content_count_max(self): + soup = self.wg.getSoup('https://e621.net/post') + + thumbs = soup.find_all('span', class_='thumb') + tids = [tmp.get("id", "").strip("p") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + + return maxid + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='tag-type-character') @@ -157,7 +166,7 @@ class E621Fetcher(scraper.fetchBase.AbstractFetcher): pageurl = 'https://e621.net/post/show/{}'.format(job.postid) try: soup = self.wg.getSoup(pageurl) - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/gelbooruFetch.py b/scraper/modules/gelbooruFetch.py index 1ab6faa..6833d1e 100644 --- a/scraper/modules/gelbooruFetch.py +++ b/scraper/modules/gelbooruFetch.py @@ -13,19 +13,28 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class GelbooruFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'Gelbooru' loggerpath = "Main.Gelbooru" - content_count_max = 4000000 def __init__(self): super().__init__() # db.session = db.Session() + + def get_content_count_max(self): + soup = self.wg.getSoup('https://gelbooru.com/index.php?page=post&s=list&tags=all') + + thumbs = soup.find_all('span', class_='thumb') + tids = [tmp.get("id", "").strip("s") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + return maxid + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='tag-type-character') @@ -163,7 +172,7 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher): time.sleep(13) else: break - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) @@ -202,7 +211,7 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher): except sqlalchemy.exc.IntegrityError: err += 1 db.session.rollback() - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching actual image' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/r34xxxScrape.py b/scraper/modules/r34xxxScrape.py index d4aa92f..926d731 100644 --- a/scraper/modules/r34xxxScrape.py +++ b/scraper/modules/r34xxxScrape.py @@ -13,17 +13,27 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class R34xxxFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'Rule34.xxx' loggerpath = "Main.Rule34-xxx" - content_count_max = 2580000 def __init__(self): super().__init__() + + def get_content_count_max(self): + soup = self.wg.getSoup('https://rule34.xxx/index.php?page=post&s=list') + + thumbs = soup.find_all('span', class_='thumb') + tids = 
[tmp.get("id", "").strip("s") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + return maxid + + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='tag-type-character') @@ -153,13 +163,13 @@ class R34xxxFetcher(scraper.fetchBase.AbstractFetcher): time.sleep(13) else: break - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) db.session.commit() return - except util.WebRequest.RedirectedError: + except WebRequest.RedirectedError: job.state = 'error' job.err_str = 'Content page redirected' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) @@ -206,7 +216,7 @@ class R34xxxFetcher(scraper.fetchBase.AbstractFetcher): db.session.rollback() break - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching actual image' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/tbibFetch.py b/scraper/modules/tbibFetch.py index a378e69..f1212e8 100644 --- a/scraper/modules/tbibFetch.py +++ b/scraper/modules/tbibFetch.py @@ -13,13 +13,12 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class TbibFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'TBIB' loggerpath = "Main.TBIB" - content_count_max = 6360000 def __init__(self): super().__init__() @@ -27,6 +26,15 @@ class TbibFetcher(scraper.fetchBase.AbstractFetcher): # db.session = db.Session() + def get_content_count_max(self): + soup = self.wg.getSoup('http://tbib.org/index.php?page=post&s=list') + + thumbs = soup.find_all('span', class_='thumb') + tids = [tmp.get("id", "").strip("s") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + return maxid + def extractTags(self, job, tagsection): characterlis = tagsection.find_all('li', class_='tag-type-character') @@ -165,13 +173,13 @@ class TbibFetcher(scraper.fetchBase.AbstractFetcher): time.sleep(13) else: break - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) db.session.commit() return - except util.WebRequest.RedirectedError: + except WebRequest.RedirectedError: job.state = 'error' job.err_str = 'Content page redirected' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) @@ -210,7 +218,7 @@ class TbibFetcher(scraper.fetchBase.AbstractFetcher): except sqlalchemy.exc.IntegrityError: err += 1 db.session.rollback() - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching actual image' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/modules/xbooruFetch.py b/scraper/modules/xbooruFetch.py index 823b4db..664b6d2 100644 --- a/scraper/modules/xbooruFetch.py +++ b/scraper/modules/xbooruFetch.py @@ -13,19 +13,27 @@ import scraper.runstate import scraper.database as db import scraper.fetchBase -import util.WebRequest +import WebRequest class XBooruFetcher(scraper.fetchBase.AbstractFetcher): pluginkey = 'XBooru' loggerpath = "Main.XBooru" - content_count_max = 710000 def __init__(self): super().__init__() # db.session = 
db.Session() + def get_content_count_max(self): + soup = self.wg.getSoup('https://xbooru.com/index.php?page=post&s=list') + + thumbs = soup.find_all('span', class_='thumb') + tids = [tmp.get("id", "").strip("s") for tmp in thumbs] + tids = [int(tmp) for tmp in tids if tmp] + maxid = max(tids) + return maxid + def extractTags(self, job, tagsection): @@ -154,7 +162,7 @@ class XBooruFetcher(scraper.fetchBase.AbstractFetcher): # print(fname) def processJob(self, job): - pageurl = 'http://tbib.org/index.php?page=post&s=view&id={}'.format(job.postid) + pageurl = 'https://xbooru.com/index.php?page=post&s=view&id={}'.format(job.postid) while 1: try: soup = self.wg.getSoup(pageurl) @@ -163,14 +171,14 @@ class XBooruFetcher(scraper.fetchBase.AbstractFetcher): time.sleep(13) else: break - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching container page' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) db.session.commit() return - if 'Gelbooru - Image List' in soup.title.get_text(): + if soup.title.get_text() == 'Xbooru ': self.log.warning("Image has been removed.") job.state = 'removed' job.err_str = 'image has been removed' @@ -202,7 +210,7 @@ class XBooruFetcher(scraper.fetchBase.AbstractFetcher): except sqlalchemy.exc.IntegrityError: err += 1 db.session.rollback() - except util.WebRequest.WebGetException: + except WebRequest.WebGetException: job.state = 'error' job.err_str = 'failure fetching actual image' self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str) diff --git a/scraper/runner.py b/scraper/runner.py index 5f786db..afbfec1 100644 --- a/scraper/runner.py +++ b/scraper/runner.py @@ -1,6 +1,5 @@ import logging - import threading import multiprocessing @@ -21,9 +20,9 @@ PLUGIN_CLASSES = [ # Ok: scraper.modules.e621Scrape.E621Fetcher, + scraper.modules.danbooruFetch.DanbooruFetcher, scraper.modules.KonaChanFetch.KonaChanFetcher, scraper.modules.r34xxxScrape.R34xxxFetcher, - scraper.modules.danbooruFetch.DanbooruFetcher, scraper.modules.tbibFetch.TbibFetcher, scraper.modules.xbooruFetch.XBooruFetcher, @@ -42,27 +41,28 @@ class RunEngine(object): self.log.info("Creating run contexts") - # for plugin in PLUGIN_CLASSES: - # plugin.run_scraper() + for plugin in PLUGIN_CLASSES: + instance = plugin() + instance.do_upsert() - threads = [] - try: - for plugin in PLUGIN_CLASSES: - th = threading.Thread(target=plugin.run_scraper, name=plugin.loggerpath) - threads.append(th) + # threads = [] + # try: + # for plugin in PLUGIN_CLASSES: + # th = threading.Thread(target=plugin.run_scraper, name=plugin.loggerpath) + # threads.append(th) - for thread in threads: - thread.start() + # for thread in threads: + # thread.start() - self.log.info("Waiting for workers to complete.") - for thread in threads: - thread.join() - except KeyboardInterrupt: - self.log.info("Waiting for executor.") - scraper.runstate.run = False - for thread in threads: - thread.join() + # self.log.info("Waiting for workers to complete.") + # for thread in threads: + # thread.join() + # except KeyboardInterrupt: + # self.log.info("Waiting for executor.") + # scraper.runstate.run = False + # for thread in threads: + # thread.join() def go(): instance = RunEngine() diff --git a/settings.py b/settings.py index 75de43c..8aec276 100644 --- a/settings.py +++ b/settings.py @@ -13,6 +13,6 @@ DATABASE_IP = "10.1.1.61" # This determines the path mask that will be used when deduplicating # hentai items. 
# If you aren't running the deduper, just specify something basic, like "/" -storeDir = r"/media/Storage/H/Danbooru/" +storeDir = r"/media/Extra/Bulk-Images/Danbooru/" diff --git a/util/WebRequest/ChromiumMixin.py b/util/WebRequest/ChromiumMixin.py deleted file mode 100644 index 2f38ba9..0000000 --- a/util/WebRequest/ChromiumMixin.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/python3 - -import time -import logging -import random -import traceback -import urllib.parse -import threading -import multiprocessing -import gc - -import bs4 - -import ChromeController - -# from cachetools import LRUCache - -# class ChromeLRUCache(LRUCache): -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) -# self.log = logging.getLogger("Main.ChromeInterfaceCache") - -# def close_chrome(self, pop_key, to_del): -# try: -# self.log.info("LRU Cache is closing chromium interface for %s", pop_key) -# to_del.close() -# except Exception: -# self.log.error("Exception in chromium teardown!") -# for line in traceback.format_exc().split("\n"): -# self.log.error(" %s", line) - -# def popitem(self): -# pop_key, to_del = super().popitem() -# self.close_chrome(pop_key, to_del) - -# def close_by_key(self, key): -# pop_key, to_del = self.pop(key) -# self.close_chrome(pop_key, to_del) - - -# def get_chromium_instance(self, cr_binary, cr_port): -# cpid = multiprocessing.current_process().name -# ctid = threading.current_thread().name -# csid = "{}-{}".format(cpid, ctid) - -# if csid in self: -# self.log.info("Using existing chromium process.") -# # We probe the remote chrome to make sure it's not defunct -# try: -# self[csid].get_current_url() -# return self[csid] -# except ChromeController.ChromeControllerException: -# self.log.error("Chromium appears to be defunct. Creating new") -# self.close_by_key(csid) - -# self.log.info("Creating Chromium process.") -# try: -# instance = ChromeController.ChromeRemoteDebugInterface(cr_binary, dbg_port = cr_port) -# except Exception as e: -# self.log.error("Failure creating chromium process!") -# for line in traceback.format_exc().split("\n"): -# self.log.error(" %s", line) - -# # Sometimes the old process is around because -# # the GC hasn't seen it, and forcing a collection can fix that. -# # Yes, this is HORRIBLE. -# gc.collect() - -# raise e - -# self[csid] = instance -# return instance - -# CHROME_CACHE = ChromeLRUCache(maxsize=2) - - -class WebGetCrMixin(object): - # creds is a list of 3-tuples that gets inserted into the password manager. - # it is structured [(top_level_url1, username1, password1), (top_level_url2, username2, password2)] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._cr_binary = "google-chrome" - - - def _syncIntoChromium(self, cr): - # Headers are a list of 2-tuples. 
We need a dict - hdict = dict(self.browserHeaders) - cr.update_headers(hdict) - for cookie in self.cj: - cr.set_cookie(cookie) - - def _syncOutOfChromium(self, cr): - for cookie in cr.get_cookies(): - self.cj.set_cookie(cookie) - - def getItemChromium(self, itemUrl): - self.log.info("Fetching page for URL: '%s' with Chromium" % itemUrl) - - with ChromeController.ChromeContext(self._cr_binary) as cr: - - self._syncIntoChromium(cr) - - response = cr.blocking_navigate_and_get_source(itemUrl, timeout=10) - - raw_url = cr.get_current_url() - fileN = urllib.parse.unquote(urllib.parse.urlparse(raw_url)[2].split("/")[-1]) - fileN = bs4.UnicodeDammit(fileN).unicode_markup - - self._syncOutOfChromium(cr) - - # Probably a bad assumption - if response['binary']: - mType = "application/x-binary" - else: - mType = "text/html" - - # So, self._cr.page_source appears to be the *compressed* page source as-rendered. Because reasons. - content = response['content'] - return content, fileN, mType - - def getHeadTitleChromium(self, url, referrer=None): - self.log.info("Getting HEAD with Chromium") - if not referrer: - referrer = url - - with ChromeController.ChromeContext(self._cr_binary) as cr: - self._syncIntoChromium(cr) - - cr.blocking_navigate(referrer) - time.sleep(random.uniform(2, 6)) - cr.blocking_navigate(url) - - title, cur_url = cr.get_page_url_title() - - self._syncOutOfChromium(cr) - - self.log.info("Resolved URL for %s -> %s", url, cur_url) - - ret = { - 'url': cur_url, - 'title': title, - } - return ret - - def getHeadChromium(self, url, referrer=None): - self.log.info("Getting HEAD with Chromium") - if not referrer: - referrer = url - - with ChromeController.ChromeContext(self._cr_binary) as cr: - self._syncIntoChromium(cr) - - - cr.blocking_navigate(referrer) - time.sleep(random.uniform(2, 6)) - cr.blocking_navigate(url) - - dummy_title, cur_url = cr.get_page_url_title() - - self._syncOutOfChromium(cr) - - return cur_url - - - def chromiumGetRenderedItem(self, url): - - with ChromeController.ChromeContext(self._cr_binary) as cr: - self._syncIntoChromium(cr) - - # get_rendered_page_source - cr.blocking_navigate(url) - - - content = cr.get_rendered_page_source() - mType = 'text/html' - fileN = '' - self._syncOutOfChromium(cr) - - - return content, fileN, mType - - - def __del__(self): - # print("ChromiumMixin destructor") - sup = super() - if hasattr(sup, '__del__'): - sup.__del__() - - # def stepThroughCloudFlare_cr(self, url, titleContains='', titleNotContains=''): - # ''' - # Use Selenium+Chromium to access a resource behind cloudflare protection. - - # Params: - # ``url`` - The URL to access that is protected by cloudflare - # ``titleContains`` - A string that is in the title of the protected page, and NOT the - # cloudflare intermediate page. The presence of this string in the page title - # is used to determine whether the cloudflare protection has been successfully - # penetrated. - - # The current WebGetRobust headers are installed into the selenium browser, which - # is then used to access the protected resource. - - # Once the protected page has properly loaded, the cloudflare access cookie is - # then extracted from the selenium browser, and installed back into the WebGetRobust - # instance, so it can continue to use the cloudflare auth in normal requests. 
- - # ''' - - # if (not titleContains) and (not titleNotContains): - # raise ValueError("You must pass either a string the title should contain, or a string the title shouldn't contain!") - - # if titleContains and titleNotContains: - # raise ValueError("You can only pass a single conditional statement!") - - # self.log.info("Attempting to access page through cloudflare browser verification.") - - # dcap = dict(DesiredCapabilities.Chromium) - # wgSettings = dict(self.browserHeaders) - - # # Install the headers from the WebGet class into Chromium - # dcap["Chromium.page.settings.userAgent"] = wgSettings.pop('User-Agent') - # for headerName in wgSettings: - # dcap['Chromium.page.customHeaders.{header}'.format(header=headerName)] = wgSettings[headerName] - - # driver = selenium.webdriver.Chromium(desired_capabilities=dcap) - # driver.set_window_size(1024, 768) - - # driver.get(url) - - # if titleContains: - # condition = EC.title_contains(titleContains) - # elif titleNotContains: - # condition = title_not_contains(titleNotContains) - # else: - # raise ValueError("Wat?") - - # try: - # WebDriverWait(driver, 20).until(condition) - # success = True - # self.log.info("Successfully accessed main page!") - # except TimeoutException: - # self.log.error("Could not pass through cloudflare blocking!") - # success = False - # # Add cookies to cookiejar - - # for cookie in driver.get_cookies(): - # self.addSeleniumCookie(cookie) - # #print cookie[u"value"] - - # self.__syncCookiesFromFile() - - # return success diff --git a/util/WebRequest/Constants.py b/util/WebRequest/Constants.py deleted file mode 100644 index 3b34004..0000000 --- a/util/WebRequest/Constants.py +++ /dev/null @@ -1,518 +0,0 @@ - -import random -random.seed() - - -# Due to general internet people douchebaggyness, I've basically said to hell with it and decided to spoof a whole assortment of browsers -# It should keep people from blocking this scraper *too* easily - -# This file generates a random browser user-agent, It should have an extremely large set of possible UA structures. 
-USER_AGENTS = [ - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/8.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; BIDUBrowser 2.x)', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.2; WOW64; Trident/7.0; .NET4.0E; .NET4.0C; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 2.0.50727; .NET CLR 3.0.30729; InfoPath.3; ms-office; MSOffice 15)', - 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)', - 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)', - 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)', - 'Mozilla/4.0 (compatible;)', - 'Mozilla/5.0', - 'Mozilla/5.0 (BB10; Kbd) AppleWebKit/537.35+ (KHTML, like Gecko) Version/10.3.2.2876 Mobile Safari/537.35+', - 'Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)', - 'Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)', - 'Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)', - 'Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)', - 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)', - 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', - 'Mozilla/5.0 (compatible; coccocbot-web/1.0; +http://help.coccoc.com/searchengine)', - 'Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)', - 'Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.org/dotbot, help@moz.com)', - 'Mozilla/5.0 (compatible; DuckDuckGo-Favicons-Bot/1.0; +http://duckduckgo.com)', - 'Mozilla/5.0 (compatible; evc-batch/2.0.20170913102128)', - 'Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)', - 'Mozilla/5.0 (compatible; FLinkhubbot/1.1; +hello@flinkhub.com )', - 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', - 'Mozilla/5.0 (compatible; linkdexbot/2.2; +http://www.linkdex.com/bots/)', - 'Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)', - 'Mozilla/5.0 (compatible; MJ12bot/v1.4.7; http://mj12bot.com/)', - 'Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)', - 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', - 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0; MASBJS)', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; NOKIA; Lumia 900)', - 'Mozilla/5.0 (compatible; Nmap Scripting Engine; https://nmap.org/book/nse.html)', - 'Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)', - 'Mozilla/5.0 (compatible; SemrushBot-BA; +http://www.semrush.com/bot.html)', - 'Mozilla/5.0 (compatible; SemrushBot/1.2~bl; +http://www.semrush.com/bot.html)', - 'Mozilla/5.0 (compatible; SeznamBot/3.2; 
+http://napoveda.seznam.cz/en/seznambot-intro/)', - 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)', - 'Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)', - 'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)', - 'Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)', - 'Mozilla/5.0 (iPad; CPU OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_1_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/61.0.3163.73 Mobile/14B100 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0 Mobile/14B100 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 Mobile/14C92 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_2_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/60.0.3112.89 Mobile/14D27 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E277 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_2 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) CriOS/60.0.3112.89 Mobile/14F89 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.0 Mobile/14F89 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) GSA/34.1.167176684 Mobile/14G60 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) CriOS/60.0.3112.89 Mobile/14G60 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) CriOS/61.0.3163.73 Mobile/14G60 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 Mobile/14G60 Safari/602.1', - 'Mozilla/5.0 (iPad; CPU OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3', - 'Mozilla/5.0 (iPad; CPU OS 7_1_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D201 Safari/9537.53', - 'Mozilla/5.0 (iPad; CPU OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4', - 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1', - 'Mozilla/5.0 (iPad; CPU OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1', - 'Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/60.0.3112.89 Mobile/13G36 Safari/601.1.46', - 'Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/61.0.3163.73 Mobile/13G36 Safari/601.1.46', - 'Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G36 Safari/601.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; 
rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:48.0) Gecko/20100101 Firefox/48.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:48.0) Gecko/20100101 Firefox/48.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/601.6.17 (KHTML, like Gecko) Version/9.1.1 Safari/601.6.17', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2752.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.8 (KHTML, like Gecko) Version/9.1.3 Safari/601.7.8', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/603.2.5 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.5', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko)', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.50.2 (KHTML, like Gecko) Version/5.0.6 Safari/533.22.3', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/6.1.3 Safari/537.75.14', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/601.7.8 (KHTML, like Gecko) Version/9.1.3 Safari/537.86.7', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.105 Safari/537.36 Vivaldi/1.92.917.43', - 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/59.0.3071.109 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.80', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.18 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3214.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/7.0.6.1042 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2700.0 Iron Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.0.12195 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.0.12195 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.11.2987.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.13.2987.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2988.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.1.3029.81 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/59.0.3071.104 Safari/537.36 Vivaldi/1.91.867.42', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 YaBrowser/17.7.1.791 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.105 Safari/537.36 Vivaldi/1.92.917.43', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 YaBrowser/17.9.1.449 (beta) Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 OPR/47.0.2631.55', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.80', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.80 (Edition Campaign 34)', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.71 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/61.4.120 Chrome/55.4.2883.120 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.102 Chrome/60.4.3112.102 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.104 Chrome/60.4.3112.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.4.3000 Chrome/47.0.2526.73 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; MATBJS; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.12 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 YaBrowser/17.3.0.1785 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/50.2.163 Chrome/44.2.2403.163 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/56.3.154 Chrome/50.3.2661.154 Safari/537.36', - 'Mozilla/5.0 (Windows NT 5.1; 
rv:11.0) Gecko Firefox/11.0 (via ggpht.com GoogleImageProxy)', - 'Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:30.0) Gecko/20100101 Firefox/30.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:43.0) Gecko/20100101 Firefox/43.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:47.0) Gecko/20100101 Firefox/47.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2', - 'Mozilla/5.0 (Windows NT 5.2; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 6.0; rv:22.0) Gecko/20130405 Firefox/22.0', - 'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/7.0.6.1042 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.273 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.1.3029.81 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 YaBrowser/17.7.1.791 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.104 Chrome/60.4.3112.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:38.9) 
Gecko/20100101 Goanna/2.2 Firefox/38.9 PaleMoon/26.5.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:55.0) Gecko/20100101 Firefox/55.0,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0', - 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2540.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 OPR/44.0.2510.1449', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 OPR/47.0.2631.55', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.80', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.18 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:3.2) Goanna/20170821 PaleMoon/27.4.2', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:42.0) Gecko/20100101 Firefox/42.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.9) Gecko/20100101 Goanna/3.2 Firefox/45.9 PaleMoon/27.4.2', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534+ (KHTML, like Gecko) BingPreview/1.0b', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.149 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36; 360Spider', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/7.0.6.1042 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 OPR/38.0.2220.29', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.1.2909.1213 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.0.12335 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2991.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 YaBrowser/17.7.0.1683 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 YaBrowser/17.7.1.791 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 YaBrowser/17.7.1.804 Yowser/2.5 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36 OPR/46.0.2597.32', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 
Safari/537.36 Sleipnir/4.5.8', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.90 Safari/537.36 Vivaldi/1.91.867.38', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.71 (Edition 360-1)', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.4.3112.104 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.102 Chrome/60.4.3112.102 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.104 Chrome/60.4.3112.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:25.0) Gecko/20100101 Firefox/25.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 IceDragon/40.1.1.18 Firefox/40.0.2', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.9) Gecko/20100101 Goanna/3.2 Firefox/45.9 PaleMoon/27.4.2', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.2.0) Gecko/52.2.0 Firefox/52.2.0; ADSSO', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0', - 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.12.2987.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0', - 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0', - 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.5.3029.81 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 OPR/44.0.2510.1449', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.0.255225845', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36,gzip(gfe)', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.0.10802 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/7.0.6.1042 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.0.12137 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 OPR/43.0.2442.1144', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2988.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.90 Safari/537.36 Vivaldi/1.91.867.38', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36 OPR/47.0.2631.80', - 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/66.4.104 Chrome/60.4.3112.104 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:55.0) Gecko/20100101 
Firefox/55.0', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MAFSJS; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko/20100101 Firefox/12.0', - 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 635) like Gecko', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.7; Google-SearchByImage) Gecko/2009021910 Firefox/3.0.7', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3', - 'Mozilla/5.0 (X11; CrOS x86_64 9460.73.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.134 Safari/537.36', - 'Mozilla/5.0 (X11; CrOS x86_64 9592.85.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.112 Safari/537.36', - 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', - 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.132 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 Google Favicon', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.78 Chrome/60.0.3112.78 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) WordPress.com mShots Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)', - 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', - 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3', - 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36 Puffin/5.2.0IT', - 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36 Puffin/5.2.2IT', - 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:41.0) Gecko/20100101 Firefox/41.0', - 'Mozilla/5.0 
(X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', - 'Opera/9.80 (BlackBerry; Opera Mini/8.0.35667/67.445; U; en) Presto/2.12.423 Version/12.16', - 'UCWEB/2.0 (Java; U; MIDP-2.0; Nokia203/20.37) U2/1.0.0 UCBrowser/8.7.0.218 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 4.4.2; id; S35G) U2/1.0.0 UCBrowser/10.7.8.806 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 5.0.1; en-US; GT-I9500) U2/1.0.0 UCBrowser/10.9.5.983 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 5.1.1; en-US; A37f) U2/1.0.0 UCBrowser/10.9.8.1006 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 5.1.1; en-US; SM-J200G) U2/1.0.0 UCBrowser/10.6.0.706 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 6.0.1; id; ASUS_Z00LDD) U2/1.0.0 UCBrowser/10.9.5.983 U2/1.0.0 Mobile', - 'UCWEB/2.0 (MIDP-2.0; U; Adr 6.0; airg.com; S8_mini) U2/1.0.0 UCBrowser/9.6.0.514 U2/1.0.0 Mobile', - 'UCWEB/2.0 (Windows; U; wds 8.10; en-IN; NOKIA; RM-978_1046) U2/1.0.0 UCBrowser/4.2.1.541 U2/1.0.0 Mobile', - - -] - -ACCEPT_LANGUAGE =[ - - "en-gb,en-us;q=0.7,de-ch;q=0.3", - "en-GB,en-US;q=0.8,en;q=0.6", - "en-GB,en-US;q=0.8,en;q=0.6", - "en-US", - "en-us, en;q=1.0,fr-ca, fr;q=0.5,pt-br, pt;q=0.5,es;q=0.5", - "en-US,de-DE;q=0.5", - "en-us,en;q=0.5", - "en-US,en;q=0.8", - "en-US,en;q=0.8,en-GB;q=0.6,fr-CA;q=0.4,fr;q=0.2", - "en-US,en;q=0.8,es-419;q=0.6", - "en-us,en;q=0.8,es;q=0.5,es-mx;q=0.3", - "en-US,en;q=0.8,es;q=0.6", - "en-US,en;q=0.8,pl;q=0.6", - "en-US,en;q=0.8,pl;q=0.6", - "en-US,en;q=0.9", - "en-US,en;q=0.9,fr;q=0.8,de;q=0.7,id;q=0.6", - "en-US,en;q=0.9,ja;q=0.8,fr;q=0.7,de;q=0.6,es;q=0.5,it;q=0.4,nl;q=0.3,sv;q=0.2,nb;q=0.1", - -] - -ACCEPT = [ - ["text/html","application/xhtml+xml","application/xml;q=0.9"], - ["application/xml","application/xhtml+xml","text/html;q=0.9"," text/plain;q=0.8","image/png"], - ["text/html","application/xhtml+xml","application/xml;q=0.9"], - ["image/jpeg","application/x-ms-application","image/gif","application/xaml+xml","image/pjpeg","application/x-ms-xbap","application/x-shockwave-flash","application/msword"], - ["text/html","application/xml;q=0.9","application/xhtml+xml","image/png","image/webp","image/jpeg","image/gif","image/x-xbitmap"] -] - -ACCEPT_POSTFIX = ["*/*;q=0.8", "*/*;q=0.5", "*/*;q=0.8", "*/*", "*/*;q=0.1"] - -ENCODINGS = [['gzip'], ['gzip', 'deflate'], ['gzip', 'deflate', 'sdch']] - - -def getUserAgent(): - ''' - Generate a randomized user agent by permuting a large set of possible values. - The returned user agent should look like a valid, in-use brower, with a specified preferred language of english. - - Return value is a list of tuples, where each tuple is one of the user-agent headers. - - Currently can provide approximately 147 * 17 * 5 * 5 * 2 * 3 * 2 values, or ~749K possible - unique user-agents. 
- ''' - coding = random.choice(ENCODINGS) - random.shuffle(coding) - coding = ",".join(coding) - - accept = random.choice(ACCEPT) - random.shuffle(accept) - accept.append(random.choice(ACCEPT_POSTFIX)) - accept = random.choice((", ", ",")).join(accept) - - user_agent = [ - ('User-Agent' , random.choice(USER_AGENTS)), - ('Accept-Language' , random.choice(ACCEPT_LANGUAGE)), - ('Accept' , accept), - ('Accept-Encoding' , coding) - ] - return user_agent - - - - -# This file based heavily on the UA List, Copyright (c) 2014, Harald Hope -# This list was released under the BSD 2 clause. - -# Home page: techpatterns.com/forums/about304.html - -# Special thanks to the following: -# User-Agent Switcher: www.chrispederick.com/work/user-agent-switcher -# Firefox history: www.zytrax.com/tech/web/firefox-history.html -# Mobile data: wikipedia.org/wiki/List_of_user_agents_for_mobile_phones -# Mobile data: www.zytrax.com/tech/web/mobile_ids.html -# Current User-Agents: http://myip.ms/browse/comp_browsers -# User-agent data: www.zytrax.com/tech/web/browser_ids.htm -# User-agent strings: www.useragentstring.com -# User-agent strings: www.webapps-online.com/online-tools/user-agent-strings/dv/ - -# License: BSD 2 Clause -# All rights reserved. Redistribution and use in source and binary forms, -# with or without modification, are permitted provided that the following -# conditions are met: -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or other -# materials provided with the distribution. -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
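
For reference, the header-randomization scheme that the removed Constants.getUserAgent() implemented can be summarized in a minimal, self-contained sketch. The tiny pools below are placeholders standing in for the full tables deleted above (the real lists carry ~150 user agents); only the permutation logic is the point, and unlike the original the pools are copied before shuffling so repeated calls do not mutate the module-level lists.

    import random

    # Trimmed placeholder pools; the deleted Constants.py carried the full tables above.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
    ]
    ACCEPT_LANGUAGE = ["en-US,en;q=0.8", "en-US,en;q=0.9"]
    ACCEPT = [
        ["text/html", "application/xhtml+xml", "application/xml;q=0.9"],
        ["application/xml", "application/xhtml+xml", "text/html;q=0.9", "text/plain;q=0.8", "image/png"],
    ]
    ACCEPT_POSTFIX = ["*/*;q=0.8", "*/*;q=0.5"]
    ENCODINGS = [['gzip'], ['gzip', 'deflate'], ['gzip', 'deflate', 'sdch']]

    def getUserAgent():
        # Pick one encoding set and one accept set, copy them so the pools stay
        # intact, and shuffle so repeated calls rarely produce the same ordering.
        coding = random.choice(ENCODINGS)[:]
        random.shuffle(coding)

        accept = random.choice(ACCEPT)[:]
        random.shuffle(accept)
        accept.append(random.choice(ACCEPT_POSTFIX))

        # Returned as a list of (header, value) tuples, matching how the
        # WebGetRobust opener installs its default headers.
        return [
            ('User-Agent',      random.choice(USER_AGENTS)),
            ('Accept-Language', random.choice(ACCEPT_LANGUAGE)),
            ('Accept',          random.choice((", ", ",")).join(accept)),
            ('Accept-Encoding', ",".join(coding)),
        ]

    if __name__ == '__main__':
        for name, value in getUserAgent():
            print("%s: %s" % (name, value))

The randomness lives entirely in the ordering and joining of otherwise-valid header values, which is why each generated header set still looks like a plausible English-language browser.
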
- diff --git a/util/WebRequest/Exceptions.py b/util/WebRequest/Exceptions.py deleted file mode 100644 index 2d87ed7..0000000 --- a/util/WebRequest/Exceptions.py +++ /dev/null @@ -1,18 +0,0 @@ - - - -class WebGetException(Exception): - pass - -class RedirectedError(WebGetException): - pass - -class ContentTypeError(WebGetException): - pass - -class ArgumentError(WebGetException): - pass - -class FetchFailureError(WebGetException): - pass - diff --git a/util/WebRequest/Handlers.py b/util/WebRequest/Handlers.py deleted file mode 100644 index e17aef8..0000000 --- a/util/WebRequest/Handlers.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/python3 -import sys -import codecs - -import http.client -import email.parser - -import urllib.request -import urllib.parse -import urllib.error - -import os.path - -import time -import http.cookiejar - -import traceback - -import logging -import zlib -import bs4 -import re -import string -import gzip -import io -import socket -import json -import base64 - -import random - -class HeadRequest(urllib.request.Request): - def get_method(self): - # Apparently HEAD is now being blocked. Because douche. - return "GET" - # return "HEAD" - -class HTTPRedirectBlockerErrorHandler(urllib.request.HTTPErrorProcessor): # pragma: no cover - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - # only add this line to stop 302 redirection. - if code == 302: - print("Code!", 302) - return response - if code == 301: - print("Code!", 301) - return response - - print("[HTTPRedirectBlockerErrorHandler] http_response! code:", code) - print(hdrs) - print(msg) - if not (200 <= code < 300): - response = self.parent.error('http', request, response, code, msg, hdrs) - return response - - https_response = http_response - -# Custom redirect handler to work around -# issue https://bugs.python.org/issue17214 -class HTTPRedirectHandler(urllib.request.HTTPRedirectHandler): - # Implementation note: To avoid the server sending us into an - # infinite loop, the request object needs to track what URLs we - # have already seen. Do this by adding a handler-specific - # attribute to the Request object. - def http_error_302(self, req, fp, code, msg, headers): - # Some servers (incorrectly) return multiple Location headers - # (so probably same goes for URI). Use first header. - if "location" in headers: - newurl = headers["location"] - elif "uri" in headers: - newurl = headers["uri"] - else: - return - - # fix a possible malformed URL - urlparts = urllib.parse.urlparse(newurl) - - # For security reasons we don't allow redirection to anything other - # than http, https or ftp. - - if urlparts.scheme not in ('http', 'https', 'ftp', ''): - raise urllib.error.HTTPError( - newurl, code, - "%s - Redirection to url '%s' is not allowed" % (msg, newurl), - headers, fp) - - if not urlparts.path: - urlparts = list(urlparts) - urlparts[2] = "/" - - newurl = urllib.parse.urlunparse(urlparts) - - # http.client.parse_headers() decodes as ISO-8859-1. Recover the - # original bytes and percent-encode non-ASCII bytes, and any special - # characters such as the space. 
- newurl = urllib.parse.quote( - newurl, encoding="iso-8859-1", safe=string.punctuation) - newurl = urllib.parse.urljoin(req.full_url, newurl) - - # XXX Probably want to forget about the state of the current - # request, although that might interact poorly with other - # handlers that also use handler-specific request attributes - new = self.redirect_request(req, fp, code, msg, headers, newurl) - if new is None: # pragma: no cover - return - - # loop detection - # .redirect_dict has a key url if url was previously visited. - if hasattr(req, 'redirect_dict'): - visited = new.redirect_dict = req.redirect_dict - if (visited.get(newurl, 0) >= self.max_repeats or - len(visited) >= self.max_redirections): - raise urllib.error.HTTPError(req.full_url, code, - self.inf_msg + msg, headers, fp) - else: - visited = new.redirect_dict = req.redirect_dict = {} - visited[newurl] = visited.get(newurl, 0) + 1 - - # Don't close the fp until we are sure that we won't use it - # with HTTPError. - fp.read() - fp.close() - - return self.parent.open(new, timeout=req.timeout) - -class PreemptiveBasicAuthHandler(urllib.request.HTTPBasicAuthHandler): - '''Preemptive basic auth. - - Instead of waiting for a 403 to then retry with the credentials, - send the credentials if the url is handled by the password manager. - Note: please use realm=None when calling add_password.''' - def http_request(self, req): - url = req.get_full_url() - realm = None - # this is very similar to the code from retry_http_basic_auth() - # but returns a request object. - user, pw = self.passwd.find_user_password(realm, url) - if pw: - raw = "%s:%s" % (user, pw) - raw = raw.encode("ascii") - auth = b'Basic ' + base64.standard_b64encode(raw).strip() - req.add_unredirected_header(self.auth_header, auth) - return req - - https_request = http_request diff --git a/util/WebRequest/HeaderParseMonkeyPatch.py b/util/WebRequest/HeaderParseMonkeyPatch.py deleted file mode 100644 index 1de5401..0000000 --- a/util/WebRequest/HeaderParseMonkeyPatch.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/python3 -import sys -import codecs - -import http.client -import email.parser - -cchardet = False - -try: - import cchardet -except ImportError: # pragma: no cover - pass - -def isUTF8Strict(data): # pragma: no cover - Only used when cchardet is missing. - ''' - Check if all characters in a bytearray are decodable - using UTF-8. - ''' - try: - decoded = data.decode('UTF-8') - except UnicodeDecodeError: - return False - else: - for ch in decoded: - if 0xD800 <= ord(ch) <= 0xDFFF: - return False - return True - -def decode_headers(header_list): - ''' - Decode a list of headers. - - Takes a list of bytestrings, returns a list of unicode strings. - The character set for each bytestring is individually decoded. - ''' - - decoded_headers = [] - for header in header_list: - if cchardet: - inferred = cchardet.detect(header) - if inferred and inferred['confidence'] > 0.8: - # print("Parsing headers!", header) - decoded_headers.append(header.decode(inferred['encoding'])) - else: - decoded_headers.append(header.decode('iso-8859-1')) - else: # pragma: no cover - # All bytes are < 127 (e.g. ASCII) - if all([char & 0x80 == 0 for char in header]): - decoded_headers.append(header.decode("us-ascii")) - elif isUTF8Strict(header): - decoded_headers.append(header.decode("utf-8")) - else: - decoded_headers.append(header.decode('iso-8859-1')) - - return decoded_headers - - -def parse_headers(fp, _class=http.client.HTTPMessage): - """Parses only RFC2822 headers from a file pointer. 
- - email Parser wants to see strings rather than bytes. - But a TextIOWrapper around self.rfile would buffer too many bytes - from the stream, bytes which we later need to read as bytes. - So we read the correct bytes here, as bytes, for email Parser - to parse. - - Note: Monkey-patched version to try to more intelligently determine - header encoding - - """ - headers = [] - while True: - line = fp.readline(http.client._MAXLINE + 1) - if len(line) > http.client._MAXLINE: - raise http.client.LineTooLong("header line") - headers.append(line) - if len(headers) > http.client._MAXHEADERS: - raise HTTPException("got more than %d headers" % http.client._MAXHEADERS) - if line in (b'\r\n', b'\n', b''): - break - - decoded_headers = decode_headers(headers) - - hstring = ''.join(decoded_headers) - - return email.parser.Parser(_class=_class).parsestr(hstring) - -http.client.parse_headers = parse_headers diff --git a/util/WebRequest/PhantomJSMixin.py b/util/WebRequest/PhantomJSMixin.py deleted file mode 100644 index 1743aa9..0000000 --- a/util/WebRequest/PhantomJSMixin.py +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/python3 - -import time -import random -import socket -import urllib.parse -import http.cookiejar -import bs4 -import selenium.webdriver -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - - - - -class title_not_contains(object): - """ An expectation for checking that the title *does not* contain a case-sensitive - substring. title is the fragment of title expected - returns True when the title matches, False otherwise - """ - def __init__(self, title): - self.title = title - - - def __call__(self, driver): - return self.title not in driver.title - -#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703 - -def wait_for(condition_function): - start_time = time.time() - while time.time() < start_time + 3: - if condition_function(): - return True - else: - time.sleep(0.1) - raise Exception( - 'Timeout waiting for {}'.format(condition_function.__name__) - ) - -class load_delay_context_manager(object): - - def __init__(self, browser): - self.browser = browser - - def __enter__(self): - self.old_page = self.browser.find_element_by_tag_name('html') - - def page_has_loaded(self): - new_page = self.browser.find_element_by_tag_name('html') - return new_page.id != self.old_page.id - - def __exit__(self, *_): - wait_for(self.page_has_loaded) - - -class WebGetPjsMixin(object): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.pjs_driver = None - - def _initPjsWebDriver(self): - if self.pjs_driver: - self.pjs_driver.quit() - dcap = dict(DesiredCapabilities.PHANTOMJS) - wgSettings = dict(self.browserHeaders) - # Install the headers from the WebGet class into phantomjs - dcap["phantomjs.page.settings.userAgent"] = wgSettings.pop('User-Agent') - for headerName in wgSettings: - if headerName != 'Accept-Encoding': - dcap['phantomjs.page.customHeaders.{header}'.format(header=headerName)] = wgSettings[headerName] - - self.pjs_driver = selenium.webdriver.PhantomJS(desired_capabilities=dcap) - self.pjs_driver.set_window_size(1280, 1024) - - - def _syncIntoPjsWebDriver(self): - ''' - So selenium is completely retarded, and you can't just set cookes, you have to - be navigated to the domain for which you want to set cookies. 
- This is extra double-plus idiotic, as it means you can't set cookies up - before navigating. - Sigh. - ''' - pass - # for cookie in self.getCookies(): - # print("Cookie: ", cookie) - - # cookurl = [ - # "http" if cookieDict['httponly'] else "https", # scheme 0 URL scheme specifier - # cookie.domain, # netloc 1 Network location part - # "/", # path 2 Hierarchical path - # "", # params 3 Parameters for last path element - # "", # query 4 Query component - # "", # fragment 5 Fragment identifier - # ] - - # cdat = { - # 'name' : cookie.name, - # 'value' : cookie.value, - # 'domain' : cookie.domain, - # 'path' : - # 'expiry' : - # } - # print("CDat: ", cdat) - - # self.pjs_driver.add_cookie(cdat) - - - def _syncOutOfPjsWebDriver(self): - for cookie in self.pjs_driver.get_cookies(): - self.addSeleniumCookie(cookie) - - - def getItemPhantomJS(self, itemUrl): - self.log.info("Fetching page for URL: '%s' with PhantomJS" % itemUrl) - - if not self.pjs_driver: - self._initPjsWebDriver() - self._syncIntoPjsWebDriver() - - with load_delay_context_manager(self.pjs_driver): - self.pjs_driver.get(itemUrl) - time.sleep(3) - - fileN = urllib.parse.unquote(urllib.parse.urlparse(self.pjs_driver.current_url)[2].split("/")[-1]) - fileN = bs4.UnicodeDammit(fileN).unicode_markup - - self._syncOutOfPjsWebDriver() - - # Probably a bad assumption - mType = "text/html" - - # So, self.pjs_driver.page_source appears to be the *compressed* page source as-rendered. Because reasons. - source = self.pjs_driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML") - - assert source != '' - - source = ""+source+"" - return source, fileN, mType - - - - def getHeadTitlePhantomJS(self, url, referrer=None): - self.getHeadPhantomJS(url, referrer) - ret = { - 'url' : self.pjs_driver.current_url, - 'title' : self.pjs_driver.title, - } - return ret - - def getHeadPhantomJS(self, url, referrer=None): - self.log.info("Getting HEAD with PhantomJS") - - if not self.pjs_driver: - self._initPjsWebDriver() - self._syncIntoPjsWebDriver() - - def try_get(loc_url): - tries = 3 - for x in range(9999): - try: - self.pjs_driver.get(loc_url) - time.sleep(random.uniform(2, 6)) - return - except socket.timeout as e: - if x > tries: - raise e - if referrer: - try_get(referrer) - try_get(url) - - self._syncOutOfPjsWebDriver() - - return self.pjs_driver.current_url - - def addSeleniumCookie(self, cookieDict): - ''' - Install a cookie exported from a selenium webdriver into - the active opener - ''' - # print cookieDict - cookie = http.cookiejar.Cookie( - version = 0, - name = cookieDict['name'], - value = cookieDict['value'], - port = None, - port_specified = False, - domain = cookieDict['domain'], - domain_specified = True, - domain_initial_dot = False, - path = cookieDict['path'], - path_specified = False, - secure = cookieDict['secure'], - expires = cookieDict['expiry'] if 'expiry' in cookieDict else None, - discard = False, - comment = None, - comment_url = None, - rest = {"httponly":"%s" % cookieDict['httponly']}, - rfc2109 = False - ) - - self.addCookie(cookie) - - - def __del__(self): - # print("PhantomJS __del__") - if self.pjs_driver != None: - self.pjs_driver.quit() - - sup = super() - if hasattr(sup, '__del__'): - sup.__del__() - - - def stepThroughCloudFlare_pjs(self, url, titleContains='', titleNotContains=''): - ''' - Use Selenium+PhantomJS to access a resource behind cloudflare protection. 
- - Params: - ``url`` - The URL to access that is protected by cloudflare - ``titleContains`` - A string that is in the title of the protected page, and NOT the - cloudflare intermediate page. The presence of this string in the page title - is used to determine whether the cloudflare protection has been successfully - penetrated. - - The current WebGetRobust headers are installed into the selenium browser, which - is then used to access the protected resource. - - Once the protected page has properly loaded, the cloudflare access cookie is - then extracted from the selenium browser, and installed back into the WebGetRobust - instance, so it can continue to use the cloudflare auth in normal requests. - - ''' - - if (not titleContains) and (not titleNotContains): - raise ValueError("You must pass either a string the title should contain, or a string the title shouldn't contain!") - - if titleContains and titleNotContains: - raise ValueError("You can only pass a single conditional statement!") - - self.log.info("Attempting to access page through cloudflare browser verification.") - - if not self.pjs_driver: - self._initPjsWebDriver() - self._syncIntoPjsWebDriver() - - - self.pjs_driver.get(url) - - if titleContains: - condition = EC.title_contains(titleContains) - elif titleNotContains: - condition = title_not_contains(titleNotContains) - else: - raise ValueError("Wat?") - - - try: - WebDriverWait(self.pjs_driver, 20).until(condition) - success = True - self.log.info("Successfully accessed main page!") - except TimeoutException: - self.log.error("Could not pass through cloudflare blocking!") - success = False - # Add cookies to cookiejar - - self._syncOutOfPjsWebDriver() - - self.__syncCookiesFromFile() - - return success - - diff --git a/util/WebRequest/WebRequestClass.py b/util/WebRequest/WebRequestClass.py deleted file mode 100644 index ef24e09..0000000 --- a/util/WebRequest/WebRequestClass.py +++ /dev/null @@ -1,859 +0,0 @@ -#!/usr/bin/python3 -import urllib.request -import urllib.parse -import urllib.error - - -import os.path - -import time -import http.cookiejar - -import traceback - -import logging -import zlib -import codecs -import re -import sys -import gzip -import io -import socket -import json - -from threading import Lock - -import bs4 -try: - import socks - from sockshandler import SocksiPyHandler - HAVE_SOCKS = True -except ImportError: - HAVE_SOCKS = False - -from . import HeaderParseMonkeyPatch - -from . import ChromiumMixin -from . import PhantomJSMixin -from . import Handlers -from . import iri2uri -from . import Constants -from . import Exceptions - -#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703 - -COOKIEWRITELOCK = Lock() - -GLOBAL_COOKIE_FILE = None - -def as_soup(str): - return bs4.BeautifulSoup(str, "lxml") - - -def determine_json_encoding(json_bytes): - ''' - Given the fact that the first 2 characters in json are guaranteed to be ASCII, we can use - these to determine the encoding. - See: http://tools.ietf.org/html/rfc4627#section-3 - - Copied here: - Since the first two characters of a JSON text will always be ASCII - characters [RFC0020], it is possible to determine whether an octet - stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking - at the pattern of nulls in the first four octets. 
- - 00 00 00 xx UTF-32BE - 00 xx 00 xx UTF-16BE - xx 00 00 00 UTF-32LE - xx 00 xx 00 UTF-16LE - xx xx xx xx UTF-8 - ''' - - assert(isinstance(json_bytes, bytes)) - - if len(json_bytes) > 4: - b1, b2, b3, b4 = json_bytes[0], json_bytes[1], json_bytes[2], json_bytes[3] - if b1 == 0 and b2 == 0 and b3 == 0 and b4 != 0: - return "UTF-32BE" - elif b1 == 0 and b2 != 0 and b3 == 0 and b4 != 0: - return "UTF-16BE" - elif b1 != 0 and b2 == 0 and b3 == 0 and b4 == 0: - return "UTF-32LE" - elif b1 != 0 and b2 == 0 and b3 != 0 and b4 == 0: - return "UTF-16LE" - elif b1 != 0 and b2 != 0 and b3 != 0 and b4 != 0: - return "UTF-8" - else: - raise Exceptions.ContentTypeError("Unknown encoding!") - - elif len(json_bytes) > 2: - b1, b2 = json_bytes[0], json_bytes[1] - if b1 == 0 and b2 == 0: - return "UTF-32BE" - elif b1 == 0 and b2 != 0: - return "UTF-16BE" - elif b1 != 0 and b2 == 0: - raise Exceptions.ContentTypeError("Json string too short to definitively infer encoding.") - elif b1 != 0 and b2 != 0: - return "UTF-8" - else: - raise Exceptions.ContentTypeError("Unknown encoding!") - - raise Exceptions.ContentTypeError("Input string too short to guess encoding!") - - -# A urllib2 wrapper that provides error handling and logging, as well as cookie management. It's a bit crude, but it works. -# Also supports transport compresion. -# OOOOLLLLLLDDDDD, has lots of creaky internals. Needs some cleanup desperately, but lots of crap depends on almost everything. -# Arrrgh. - -class WebGetRobust(PhantomJSMixin.WebGetPjsMixin, ChromiumMixin.WebGetCrMixin): - - COOKIEFILE = 'cookies.lwp' # the path and filename to save your cookies in - cj = None - cookielib = None - opener = None - - errorOutCount = 2 - # retryDelay = 0.1 - retryDelay = 0.01 - - data = None - - # creds is a list of 3-tuples that gets inserted into the password manager. - # it is structured [(top_level_url1, username1, password1), (top_level_url2, username2, password2)] - def __init__(self, creds=None, logPath="Main.WebRequest", cookie_lock=None, cloudflare=False, use_socks=False, alt_cookiejar=None): - super().__init__() - - self.rules = {} - self.rules['cloudflare'] = cloudflare - if cookie_lock: - self.cookie_lock = cookie_lock - else: - self.cookie_lock = COOKIEWRITELOCK - - self.use_socks = use_socks - # Override the global default socket timeout, so hung connections will actually time out properly. - socket.setdefaulttimeout(5) - - self.log = logging.getLogger(logPath) - # print("Webget init! 
Logpath = ", logPath) - if creds: - print("Have creds for a domain") - - # Due to general internet people douchebaggyness, I've basically said to hell with it and decided to spoof a whole assortment of browsers - # It should keep people from blocking this scraper *too* easily - self.browserHeaders = Constants.getUserAgent() - - self.data = urllib.parse.urlencode(self.browserHeaders) - - if creds: - print("Have credentials, installing password manager into urllib handler.") - passManager = urllib.request.HTTPPasswordMgrWithDefaultRealm() - for url, username, password in creds: - passManager.add_password(None, url, username, password) - self.credHandler = Handlers.PreemptiveBasicAuthHandler(passManager) - else: - self.credHandler = None - - self.alt_cookiejar = alt_cookiejar - self.__loadCookies() - def chunkReport(self, bytesSoFar, totalSize): - if totalSize: - percent = float(bytesSoFar) / totalSize - percent = round(percent * 100, 2) - self.log.info("Downloaded %d of %d bytes (%0.2f%%)" % (bytesSoFar, totalSize, percent)) - else: - self.log.info("Downloaded %d bytes" % (bytesSoFar)) - - def __chunkRead(self, response, chunkSize=2 ** 18, reportHook=None): - contentLengthHeader = response.info().getheader('Content-Length') - if contentLengthHeader: - totalSize = contentLengthHeader.strip() - totalSize = int(totalSize) - else: - totalSize = None - bytesSoFar = 0 - pgContent = "" - while 1: - chunk = response.read(chunkSize) - pgContent += chunk - bytesSoFar += len(chunk) - - if not chunk: - break - - if reportHook: - reportHook(bytesSoFar, chunkSize, totalSize) - - return pgContent - - def getSoupNoRedirects(self, *args, **kwargs): - if 'returnMultiple' in kwargs: - raise Exceptions.ArgumentError("getSoup cannot be called with 'returnMultiple'") - - if 'soup' in kwargs and kwargs['soup']: - raise Exceptions.ArgumentError("getSoup contradicts the 'soup' directive!") - - kwargs['returnMultiple'] = True - - tgt_url = kwargs.get('requestedUrl', None) - if not tgt_url: - tgt_url = args[0] - - - page, handle = self.getpage(*args, **kwargs) - - redirurl = handle.geturl() - if redirurl != tgt_url: - self.log.error("Requested %s, redirected to %s. Raising error", tgt_url, redirurl) - - raise Exceptions.RedirectedError("Requested %s, redirected to %s" % ( - tgt_url, redirurl)) - - soup = as_soup(page) - return soup - - def getSoup(self, *args, **kwargs): - if 'returnMultiple' in kwargs and kwargs['returnMultiple']: - raise Exceptions.ArgumentError("getSoup cannot be called with 'returnMultiple' being true") - - if 'soup' in kwargs and kwargs['soup']: - raise Exceptions.ArgumentError("getSoup contradicts the 'soup' directive!") - - page = self.getpage(*args, **kwargs) - if isinstance(page, bytes): - raise Exceptions.ContentTypeError("Received content not decoded! Cannot parse!") - - soup = as_soup(page) - return soup - - def getJson(self, *args, **kwargs): - if 'returnMultiple' in kwargs and kwargs['returnMultiple']: - raise Exceptions.ArgumentError("getSoup cannot be called with 'returnMultiple' being true") - - attempts = 0 - while 1: - try: - page = self.getpage(*args, **kwargs) - if isinstance(page, bytes): - page = page.decode(determine_json_encoding(page)) - # raise ValueError("Received content not decoded! 
Cannot parse!") - - page = page.strip() - ret = json.loads(page) - return ret - except ValueError: - if attempts < 1: - attempts += 1 - self.log.error("JSON Parsing issue retreiving content from page!") - for line in traceback.format_exc().split("\n"): - self.log.error("%s", line.rstrip()) - self.log.error("Retrying!") - - # Scramble our current UA - self.browserHeaders = Constants.getUserAgent() - if self.alt_cookiejar: - self.cj.init_agent(new_headers=self.browserHeaders) - - time.sleep(self.retryDelay) - else: - self.log.error("JSON Parsing issue, and retries exhausted!") - # self.log.error("Page content:") - # self.log.error(page) - # with open("Error-ctnt-{}.json".format(time.time()), "w") as tmp_err_fp: - # tmp_err_fp.write(page) - raise - - def getFileAndName(self, *args, **kwargs): - if 'returnMultiple' in kwargs: - raise Exceptions.ArgumentError("getFileAndName cannot be called with 'returnMultiple'") - - if 'soup' in kwargs and kwargs['soup']: - raise Exceptions.ArgumentError("getFileAndName contradicts the 'soup' directive!") - - kwargs["returnMultiple"] = True - - pgctnt, pghandle = self.getpage(*args, **kwargs) - - info = pghandle.info() - if not 'Content-Disposition' in info: - hName = '' - elif not 'filename=' in info['Content-Disposition']: - hName = '' - else: - hName = info['Content-Disposition'].split('filename=')[1] - - - return pgctnt, hName - - def getFileNameMime(self, *args, **kwargs): - if 'returnMultiple' in kwargs: - raise Exceptions.ArgumentError("getFileAndName cannot be called with 'returnMultiple'") - - if 'soup' in kwargs and kwargs['soup']: - raise Exceptions.ArgumentError("getFileAndName contradicts the 'soup' directive!") - - kwargs["returnMultiple"] = True - - pgctnt, pghandle = self.getpage(*args, **kwargs) - - info = pghandle.info() - if not 'Content-Disposition' in info: - hName = '' - elif not 'filename=' in info['Content-Disposition']: - hName = '' - else: - hName = info['Content-Disposition'].split('filename=')[1] - - mime = info.get_content_type() - - return pgctnt, hName, mime - - def getpage(self, requestedUrl, **kwargs): - self.log.info("Fetching content at URL: %s", requestedUrl) - - # strip trailing and leading spaces. - requestedUrl = requestedUrl.strip() - - # If we have 'soup' as a param, just pop it, and call `getSoup()`. - if 'soup' in kwargs and kwargs['soup']: - self.log.warning("'soup' kwarg is depreciated. Please use the `getSoup()` call instead.") - kwargs.pop('soup') - return self.getSoup(requestedUrl, **kwargs) - - # Decode the kwargs values - addlHeaders = kwargs.setdefault("addlHeaders", None) - returnMultiple = kwargs.setdefault("returnMultiple", False) - callBack = kwargs.setdefault("callBack", None) - postData = kwargs.setdefault("postData", None) - retryQuantity = kwargs.setdefault("retryQuantity", None) - nativeError = kwargs.setdefault("nativeError", False) - binaryForm = kwargs.setdefault("binaryForm", False) - - # Conditionally encode the referrer if needed, because otherwise - # urllib will barf on unicode referrer values. 
- if addlHeaders and 'Referer' in addlHeaders: - addlHeaders['Referer'] = iri2uri.iri2uri(addlHeaders['Referer']) - - - retryCount = 0 - while 1: - - pgctnt = None - pghandle = None - - pgreq = self.__buildRequest(requestedUrl, postData, addlHeaders, binaryForm) - - errored = False - lastErr = "" - - retryCount = retryCount + 1 - - if (retryQuantity and retryCount > retryQuantity) or (not retryQuantity and retryCount > self.errorOutCount): - self.log.error("Failed to retrieve Website : %s at %s All Attempts Exhausted", pgreq.get_full_url(), time.ctime(time.time())) - pgctnt = None - try: - self.log.critical("Critical Failure to retrieve page! %s at %s, attempt %s", pgreq.get_full_url(), time.ctime(time.time()), retryCount) - self.log.critical("Error: %s", lastErr) - self.log.critical("Exiting") - except: - self.log.critical("And the URL could not be printed due to an encoding error") - break - - #print "execution", retryCount - try: - # print("Getpage!", requestedUrl, kwargs) - pghandle = self.opener.open(pgreq, timeout=30) # Get Webpage - # print("Gotpage") - - except urllib.error.HTTPError as e: # Lotta logging - self.log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), retryCount) - self.log.warning("Error Code: %s", e) - - #traceback.print_exc() - lastErr = e - try: - - self.log.warning("Original URL: %s", requestedUrl) - errored = True - except: - self.log.warning("And the URL could not be printed due to an encoding error") - - if e.code == 404: - #print "Unrecoverable - Page not found. Breaking" - self.log.critical("Unrecoverable - Page not found. Breaking") - break - - time.sleep(self.retryDelay) - if e.code == 503: - errcontent = e.read() - if b'This process is automatic. Your browser will redirect to your requested content shortly.' in errcontent: - self.log.warning("Cloudflare failure! Doing automatic step-through.") - self.stepThroughCloudFlare(requestedUrl, titleNotContains="Just a moment...") - except UnicodeEncodeError: - self.log.critical("Unrecoverable Unicode issue retreiving page - %s", requestedUrl) - for line in traceback.format_exc().split("\n"): - self.log.critical("%s", line.rstrip()) - self.log.critical("Parameters:") - self.log.critical(" requestedUrl: '%s'", requestedUrl) - self.log.critical(" postData: '%s'", postData) - self.log.critical(" addlHeaders: '%s'", addlHeaders) - self.log.critical(" binaryForm: '%s'", binaryForm) - - break - - except Exception: - errored = True - #traceback.print_exc() - lastErr = sys.exc_info() - self.log.warning("Retreival failed. Traceback:") - self.log.warning(str(lastErr)) - self.log.warning(traceback.format_exc()) - - self.log.warning("Error Retrieving Page! - Trying again - Waiting %s seconds", self.retryDelay) - - try: - self.log.critical("Error on page - %s", requestedUrl) - except: - self.log.critical("And the URL could not be printed due to an encoding error") - - time.sleep(self.retryDelay) - - continue - - if pghandle != None: - self.log.info("Request for URL: %s succeeded at %s On Attempt %s. 
Recieving...", pgreq.get_full_url(), time.ctime(time.time()), retryCount) - pgctnt = self.__retreiveContent(pgreq, pghandle, callBack) - - # if __retreiveContent did not return false, it managed to fetch valid results, so break - if pgctnt != False: - break - - if errored and pghandle != None: - print(("Later attempt succeeded %s" % pgreq.get_full_url())) - elif (errored or not pgctnt) and pghandle is None: - - if lastErr and nativeError: - raise lastErr - raise Exceptions.FetchFailureError("Failed to retreive page '%s'!" % (requestedUrl, )) - - if returnMultiple: - - return pgctnt, pghandle - else: - return pgctnt - - def getItem(self, itemUrl): - - try: - content, handle = self.getpage(itemUrl, returnMultiple=True) - except: - print("Failure?") - if self.rules['cloudflare']: - if not self.stepThroughCloudFlare(itemUrl, titleNotContains='Just a moment...'): - raise Exceptions.FetchFailureError("Could not step through cloudflare!") - # Cloudflare cookie set, retrieve again - content, handle = self.getpage(itemUrl, returnMultiple=True) - else: - raise - - if not content or not handle: - raise urllib.error.URLError("Failed to retreive file from page '%s'!" % itemUrl) - - fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) - fileN = bs4.UnicodeDammit(fileN).unicode_markup - mType = handle.info()['Content-Type'] - - # If there is an encoding in the content-type (or any other info), strip it out. - # We don't care about the encoding, since WebFunctions will already have handled that, - # and returned a decoded unicode object. - if mType and ";" in mType: - mType = mType.split(";")[0].strip() - - # *sigh*. So minus.com is fucking up their http headers, and apparently urlencoding the - # mime type, because apparently they're shit at things. - # Anyways, fix that. - if '%2F' in mType: - mType = mType.replace('%2F', '/') - - self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0) - return content, fileN, mType - - def getHead(self, url, addlHeaders=None): - for x in range(9999): - try: - self.log.info("Doing HTTP HEAD request for '%s'", url) - pgreq = self.__buildRequest(url, None, addlHeaders, None, req_class=Handlers.HeadRequest) - pghandle = self.opener.open(pgreq, timeout=30) - returl = pghandle.geturl() - if returl != url: - self.log.info("HEAD request returned a different URL '%s'", returl) - - return returl - except socket.timeout as e: - self.log.info("Timeout, retrying....") - if x >= 3: - self.log.error("Failure fetching: %s", url) - raise Exceptions.FetchFailureError("Timout when fetching %s. Error: %s" % (url, e)) - except urllib.error.URLError as e: - # Continue even in the face of cloudflare crapping it's pants - if e.code == 500 and e.geturl(): - return e.geturl() - self.log.info("URLError, retrying....") - if x >= 3: - self.log.error("Failure fetching: %s", url) - raise Exceptions.FetchFailureError("URLError when fetching %s. Error: %s" % (url, e)) - - ###################################################################################################################################################### - ###################################################################################################################################################### - - def __decodeHtml(self, pageContent, cType): - - # this *should* probably be done using a parser. 
- # However, it seems to be grossly overkill to shove the whole page (which can be quite large) through a parser just to pull out a tag that - # should be right near the page beginning anyways. - # As such, it's a regular expression for the moment - - # Regex is of bytes type, since we can't convert a string to unicode until we know the encoding the - # bytes string is using, and we need the regex to get that encoding - coding = re.search(rb"charset=[\'\"]?([a-zA-Z0-9\-]*)[\'\"]?", pageContent, flags=re.IGNORECASE) - - cType = b"" - charset = None - try: - if coding: - cType = coding.group(1) - codecs.lookup(cType.decode("ascii")) - charset = cType.decode("ascii") - - except LookupError: - - # I'm actually not sure what I was thinking when I wrote this if statement. I don't think it'll ever trigger. - if (b";" in cType) and (b"=" in cType): # the server is reporting an encoding. Now we use it to decode the - - dummy_docType, charset = cType.split(b";") - charset = charset.split(b"=")[-1] - - if not charset: - self.log.warning("Could not find encoding information on page - Using default charset. Shit may break!") - charset = "iso-8859-1" - - try: - pageContent = str(pageContent, charset) - - except UnicodeDecodeError: - self.log.error("Encoding Error! Stripping invalid chars.") - pageContent = pageContent.decode('utf-8', errors='ignore') - - return pageContent - - def __buildRequest(self, pgreq, postData, addlHeaders, binaryForm, req_class = None): - if req_class is None: - req_class = urllib.request.Request - - pgreq = iri2uri.iri2uri(pgreq) - - try: - params = {} - headers = {} - if postData != None: - self.log.info("Making a post-request! Params: '%s'", postData) - params['data'] = urllib.parse.urlencode(postData).encode("utf-8") - if addlHeaders != None: - self.log.info("Have additional GET parameters!") - for key, parameter in addlHeaders.items(): - self.log.info(" Item: '%s' -> '%s'", key, parameter) - headers = addlHeaders - if binaryForm: - self.log.info("Binary form submission!") - if 'data' in params: - raise Exceptions.ArgumentError("You cannot make a binary form post and a plain post request at the same time!") - - params['data'] = binaryForm.make_result() - headers['Content-type'] = binaryForm.get_content_type() - headers['Content-length'] = len(params['data']) - - return req_class(pgreq, headers=headers, **params) - - except: - self.log.critical("Invalid header or url") - raise - - def __decompressContent(self, coding, pgctnt): - #preLen = len(pgctnt) - if coding == 'deflate': - compType = "deflate" - - pgctnt = zlib.decompress(pgctnt, -zlib.MAX_WBITS) - - elif coding == 'gzip': - compType = "gzip" - - buf = io.BytesIO(pgctnt) - f = gzip.GzipFile(fileobj=buf) - pgctnt = f.read() - - elif coding == "sdch": - raise Exceptions.ContentTypeError("Wait, someone other then google actually supports SDCH compression?") - - else: - compType = "none" - - return compType, pgctnt - - def __decodeTextContent(self, pgctnt, cType): - - if cType: - if (";" in cType) and ("=" in cType): - # the server is reporting an encoding. Now we use it to decode the content - # Some wierdos put two charsets in their headers: - # `text/html;Charset=UTF-8;charset=UTF-8` - # Split, and take the first two entries. - docType, charset = cType.split(";")[:2] - charset = charset.split("=")[-1] - - # Only decode content marked as text (yeah, google is serving zip files - # with the content-disposition charset header specifying "UTF-8") or - # specifically allowed other content types I know are really text. 
- decode = ['application/atom+xml', 'application/xml', "application/json", 'text'] - if any([item in docType for item in decode]): - try: - pgctnt = str(pgctnt, charset) - except UnicodeDecodeError: - self.log.error("Encoding Error! Stripping invalid chars.") - pgctnt = pgctnt.decode('utf-8', errors='ignore') - - else: - # The server is not reporting an encoding in the headers. - # Use content-aware mechanisms for determing the content encoding. - - - if "text/html" in cType or \ - 'text/javascript' in cType or \ - 'text/css' in cType or \ - 'application/xml' in cType or \ - 'application/atom+xml' in cType: # If this is a html/text page, we want to decode it using the local encoding - - pgctnt = self.__decodeHtml(pgctnt, cType) - - elif "text/plain" in cType or "text/xml" in cType: - pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup - - # Assume JSON is utf-8. Probably a bad idea? - elif "application/json" in cType: - pgctnt = pgctnt.decode('utf-8') - - elif "text" in cType: - self.log.critical("Unknown content type!") - self.log.critical(cType) - - else: - self.log.critical("No content disposition header!") - self.log.critical("Cannot guess content type!") - - return pgctnt - - def __retreiveContent(self, pgreq, pghandle, callBack): - try: - # If we have a progress callback, call it for chunked read. - # Otherwise, just read in the entire content. - if callBack: - pgctnt = self.__chunkRead(pghandle, 2 ** 17, reportHook=callBack) - else: - pgctnt = pghandle.read() - - - if pgctnt is None: - return False - - self.log.info("URL fully retrieved.") - - preDecompSize = len(pgctnt)/1000.0 - - encoded = pghandle.headers.get('Content-Encoding') - compType, pgctnt = self.__decompressContent(encoded, pgctnt) - - - decompSize = len(pgctnt)/1000.0 - # self.log.info("Page content type = %s", type(pgctnt)) - cType = pghandle.headers.get("Content-Type") - if compType == 'none': - self.log.info("Compression type = %s. Content Size = %0.3fK. File type: %s.", compType, decompSize, cType) - else: - self.log.info("Compression type = %s. Content Size compressed = %0.3fK. Decompressed = %0.3fK. File type: %s.", compType, preDecompSize, decompSize, cType) - - pgctnt = self.__decodeTextContent(pgctnt, cType) - - return pgctnt - - except: - print("pghandle = ", pghandle) - - self.log.error(sys.exc_info()) - traceback.print_exc() - self.log.error("Error Retrieving Page! - Transfer failed. Waiting %s seconds before retrying", self.retryDelay) - - try: - self.log.critical("Critical Failure to retrieve page! %s at %s", pgreq.get_full_url(), time.ctime(time.time())) - self.log.critical("Exiting") - except: - self.log.critical("And the URL could not be printed due to an encoding error") - print() - self.log.error(pghandle) - time.sleep(self.retryDelay) - - return False - - - # HUGE GOD-FUNCTION. - # OH GOD FIXME. 
- - # postData expects a dict - # addlHeaders also expects a dict - - ###################################################################################################################################################### - ###################################################################################################################################################### - - def __loadCookies(self): - - if self.alt_cookiejar is not None: - self.alt_cookiejar.init_agent(new_headers=self.browserHeaders) - self.cj = self.alt_cookiejar - else: - self.cj = http.cookiejar.LWPCookieJar() # This is a subclass of FileCookieJar - # that has useful load and save methods - if self.cj is not None: - if os.path.isfile(self.COOKIEFILE): - try: - self.__updateCookiesFromFile() - # self.log.info("Loading CookieJar") - except: - self.log.critical("Cookie file is corrupt/damaged?") - try: - os.remove(self.COOKIEFILE) - except FileNotFoundError: - pass - if http.cookiejar is not None: - # self.log.info("Installing CookieJar") - self.log.debug(self.cj) - cookieHandler = urllib.request.HTTPCookieProcessor(self.cj) - args = (cookieHandler, Handlers.HTTPRedirectHandler) - if self.credHandler: - print("Have cred handler. Building opener using it") - args += (self.credHandler, ) - if self.use_socks: - print("Using Socks handler") - if not HAVE_SOCKS: - raise RuntimeError("SOCKS Use specified, and no socks installed!") - args = (SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050), ) + args - - self.opener = urllib.request.build_opener(*args) - #self.opener.addheaders = [('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')] - self.opener.addheaders = self.browserHeaders - #urllib2.install_opener(self.opener) - - for cookie in self.cj: - self.log.debug(cookie) - #print cookie - - def __syncCookiesFromFile(self): - # self.log.info("Synchronizing cookies with cookieFile.") - if os.path.isfile(self.COOKIEFILE): - self.cj.save("cookietemp.lwp") - self.cj.load(self.COOKIEFILE) - self.cj.load("cookietemp.lwp") - # First, load any changed cookies so we don't overwrite them - # However, we want to persist any cookies that we have that are more recent then the saved cookies, so we temporarily save - # the cookies in memory to a temp-file, then load the cookiefile, and finally overwrite the loaded cookies with the ones from the - # temp file - - def __updateCookiesFromFile(self): - if os.path.exists(self.COOKIEFILE): - # self.log.info("Synchronizing cookies with cookieFile.") - self.cj.load(self.COOKIEFILE) - # Update cookies from cookiefile - - def addCookie(self, inCookie): - self.log.info("Updating cookie!") - self.cj.set_cookie(inCookie) - - def saveCookies(self, halting=False): - - locked = self.cookie_lock.acquire(timeout=5) - if not locked: - self.log.error("Failed to acquire cookie-lock!") - return - - # print("Have %d cookies before saving cookiejar" % len(self.cj)) - try: - # self.log.info("Trying to save cookies!") - if self.cj is not None: # If cookies were used - - self.__syncCookiesFromFile() - - # self.log.info("Have cookies to save") - for cookie in self.cj: - # print(cookie) - # print(cookie.expires) - - if isinstance(cookie.expires, int) and cookie.expires > 30000000000: # Clamp cookies that expire stupidly far in the future because people are assholes - cookie.expires = 30000000000 - - # self.log.info("Calling save function") - self.cj.save(self.COOKIEFILE) # save the cookies again - - - # self.log.info("Cookies Saved") - else: - self.log.info("No cookies to save?") - except Exception as e: - pass - 
# The destructor call order is too incoherent, and shit fails - # during the teardown with null-references. The error printout is - # not informative, so just silence it. - # print("Possible error on exit (or just the destructor): '%s'." % e) - finally: - self.cookie_lock.release() - - # print("Have %d cookies after saving cookiejar" % len(self.cj)) - if not halting: - self.__syncCookiesFromFile() - # print "Have %d cookies after reloading cookiejar" % len(self.cj) - - def getCookies(self): - - locked = self.cookie_lock.acquire(timeout=5) - if not locked: - raise RuntimeError("Could not acquire lock on cookiejar") - - try: - # self.log.info("Trying to save cookies!") - if self.cj is not None: # If cookies were used - self.__syncCookiesFromFile() - finally: - self.cookie_lock.release() - - return self.cj - - ###################################################################################################################################################### - ###################################################################################################################################################### - - def __del__(self): - # print "WGH Destructor called!" - # print("WebRequest __del__") - self.saveCookies(halting=True) - - sup = super() - if hasattr(sup, '__del__'): - sup.__del__() - - - - - def stepThroughCloudFlare(self, *args, **kwargs): - # Shim to the underlying web browser of choice - - self.stepThroughCloudFlare_pjs(*args, **kwargs) - - diff --git a/util/WebRequest/__init__.py b/util/WebRequest/__init__.py deleted file mode 100644 index c465596..0000000 --- a/util/WebRequest/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ - -from .WebRequestClass import as_soup -from .WebRequestClass import WebGetRobust - -from .Exceptions import WebGetException -from .Exceptions import ContentTypeError -from .Exceptions import ArgumentError -from .Exceptions import RedirectedError -from .Exceptions import FetchFailureError - diff --git a/util/WebRequest/iri2uri.py b/util/WebRequest/iri2uri.py deleted file mode 100644 index 07a4a59..0000000 --- a/util/WebRequest/iri2uri.py +++ /dev/null @@ -1,75 +0,0 @@ - -import urllib.parse - -# Convert an IRI to a URI following the rules in RFC 3987 -# -# The characters we need to enocde and escape are defined in the spec: -# -# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD -# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF -# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD -# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD -# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD -# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD -# / %xD0000-DFFFD / %xE1000-EFFFD - -escape_range = [ - (0xA0, 0xD7FF), - (0xE000, 0xF8FF), - (0xF900, 0xFDCF), - (0xFDF0, 0xFFEF), - (0x10000, 0x1FFFD), - (0x20000, 0x2FFFD), - (0x30000, 0x3FFFD), - (0x40000, 0x4FFFD), - (0x50000, 0x5FFFD), - (0x60000, 0x6FFFD), - (0x70000, 0x7FFFD), - (0x80000, 0x8FFFD), - (0x90000, 0x9FFFD), - (0xA0000, 0xAFFFD), - (0xB0000, 0xBFFFD), - (0xC0000, 0xCFFFD), - (0xD0000, 0xDFFFD), - (0xE1000, 0xEFFFD), - (0xF0000, 0xFFFFD), - (0x100000, 0x10FFFD), -] - -def encode(c): - retval = c - i = ord(c) - for low, high in escape_range: - if i < low: - break - if i >= low and i <= high: - retval = "".join(["%%%2X" % o for o in c.encode('utf-8')]) - break - return retval - - -def iri2uri(uri): - """Convert an IRI to a URI. Note that IRIs must be - passed in a unicode strings. 
That is, do not utf-8 encode - the IRI before passing it into the function.""" - - assert uri != None, 'iri2uri must be passed a non-none string!' - - original = uri - if isinstance(uri ,str): - (scheme, authority, path, query, fragment) = urllib.parse.urlsplit(uri) - authority = authority.encode('idna').decode('utf-8') - # For each character in 'ucschar' or 'iprivate' - # 1. encode as utf-8 - # 2. then %-encode each octet of that utf-8 - path = urllib.parse.quote(path) - uri = urllib.parse.urlunsplit((scheme, authority, path, query, fragment)) - uri = "".join([encode(c) for c in uri]) - - # urllib.parse.urlunsplit(urllib.parse.urlsplit({something}) - # strips any trailing "?" chars. While this may be legal according to the - # spec, it breaks some services. Therefore, we patch - # the "?" back in if it has been removed. - if original.endswith("?") and not uri.endswith("?"): - uri = uri+"?" - return uri diff --git a/util/WebRequest/tests/__init__.py b/util/WebRequest/tests/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/util/WebRequest/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/util/WebRequest/tests/test_chromium.py b/util/WebRequest/tests/test_chromium.py deleted file mode 100644 index 1331d0e..0000000 --- a/util/WebRequest/tests/test_chromium.py +++ /dev/null @@ -1,192 +0,0 @@ -import unittest -import socket -import json -import base64 -import zlib -import gzip -import bs4 -import ChromeController -from http.server import BaseHTTPRequestHandler, HTTPServer -from threading import Thread - -import util.WebRequest as WebRequest - - -class MockServerRequestHandler(BaseHTTPRequestHandler): - def do_GET(self): - # Process an HTTP GET request and return a response with an HTTP 200 status. - print("Path: ", self.path) - - if self.path == "/": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"Root OK?") - - if self.path == "/with_title_1": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"Page Title 1Root OK?") - - elif self.path == "/raw-txt": - self.send_response(200) - self.send_header('Content-type', "text/plain") - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/binary_ctnt": - self.send_response(200) - self.send_header('Content-type', "image/jpeg") - self.end_headers() - self.wfile.write(b"Binary!\x00\x01\x02\x03") - - elif self.path == "/redirect/bad-1": - self.send_response(302) - self.end_headers() - - elif self.path == "/redirect/bad-2": - self.send_response(302) - self.send_header('location', "bad-2") - self.end_headers() - - elif self.path == "/redirect/bad-3": - self.send_response(302) - self.send_header('location', "gopher://www.google.com") - self.end_headers() - - elif self.path == "/redirect/from-1": - self.send_response(302) - self.send_header('location', "to-1") - self.end_headers() - - if self.path == "/redirect/to-1": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-1") - - elif self.path == "/redirect/from-2": - self.send_response(302) - self.send_header('uri', "to-2") - self.end_headers() - - if self.path == "/redirect/to-2": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-2") - - elif self.path == "/redirect/from-3": - self.send_response(302) - newurl = "http://{}:{}".format(self.server.server_address[0], self.server.server_address[1]) - self.send_header('uri', newurl) - self.end_headers() - - -def get_free_port(): - s 
= socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) - s.bind(('localhost', 0)) - address, port = s.getsockname() - s.close() - return port - - -class TestChromium(unittest.TestCase): - def setUp(self): - - # Configure mock server. - self.mock_server_port = get_free_port() - self.mock_server = HTTPServer(('localhost', self.mock_server_port), MockServerRequestHandler) - - # Start running mock server in a separate thread. - # Daemon threads automatically shut down when the main process exits. - self.mock_server_thread = Thread(target=self.mock_server.serve_forever) - self.mock_server_thread.setDaemon(True) - self.mock_server_thread.start() - self.wg = WebRequest.WebGetRobust() - - def tearDown(self): - self.mock_server.shutdown() - - # Hacky force-close of the chromium interface - self.wg.close_chromium() - del self.wg - - def test_fetch_1(self): - page = self.wg.getpage("http://localhost:{}".format(self.mock_server_port)) - self.assertEqual(page, 'Root OK?') - - def test_fetch_chromium_1(self): - page, fname, mtype = self.wg.getItemChromium("http://localhost:{}".format(self.mock_server_port)) - - self.assertEqual(fname, '') - self.assertEqual(mtype, 'text/html') - self.assertEqual(page, 'Root OK?') - - def test_fetch_chromium_2(self): - page, fname, mtype = self.wg.getItemChromium("http://localhost:{}/raw-txt".format(self.mock_server_port)) - self.assertEqual(fname, 'raw-txt') - self.assertEqual(mtype, 'text/html') # I'm not properly retreiving the mimetype from chromium - self.assertEqual(page, 'Root OK?') - - def test_fetch_chromium_3(self): - page, fname, mtype = self.wg.getItemChromium("http://localhost:{}/binary_ctnt".format(self.mock_server_port)) - self.assertEqual(fname, 'binary_ctnt') - self.assertEqual(mtype, 'application/x-binary') - self.assertEqual(page, b"Binary!\x00\x01\x02\x03") - - def test_head_chromium_1(self): - url_1 = "http://localhost:{}/raw-txt".format(self.mock_server_port) - purl_1 = self.wg.getHeadChromium(url_1) - self.assertEqual(purl_1, url_1) - - def test_head_chromium_2(self): - url_2 = "http://localhost:{}/redirect/to-1".format(self.mock_server_port) - purl_2 = self.wg.getHeadChromium("http://localhost:{}/redirect/from-1".format(self.mock_server_port)) - self.assertEqual(purl_2, url_2) - - def test_head_chromium_3(self): - url_3 = "http://localhost:{}/redirect/bad-1".format(self.mock_server_port) - purl_3 = self.wg.getHeadChromium("http://localhost:{}/redirect/bad-1".format(self.mock_server_port)) - self.assertEqual(purl_3, url_3) - - def test_head_chromium_4(self): - # Chromium changes infinite redirects into timeouts. - with self.assertRaises(ChromeController.ChromeNavigateTimedOut): - self.wg.getHeadChromium("http://localhost:{}/redirect/bad-2".format(self.mock_server_port)) - - def test_head_chromium_5(self): - # Chromium changes infinite redirects into timeouts. 
- with self.assertRaises(ChromeController.ChromeNavigateTimedOut): - self.wg.getHeadChromium("http://localhost:{}/redirect/bad-3".format(self.mock_server_port)) - - def test_head_title_chromium_1(self): - pg_url = "http://localhost:{}/with_title_1".format(self.mock_server_port) - retreived = self.wg.getHeadTitleChromium(pg_url) - - expect = { - 'url': pg_url, - 'title': 'Page Title 1', - } - self.assertEqual(retreived, expect) - - def test_head_title_chromium_2(self): - pg_url = "http://localhost:{}/".format(self.mock_server_port) - retreived = self.wg.getHeadTitleChromium(pg_url) - - expect = { - # If no title is specified, chromium returns the server URL - 'url': pg_url, - 'title': 'localhost:{}'.format(self.mock_server_port), - } - self.assertEqual(retreived, expect) - - def test_head_title_chromium_3(self): - pg_url = "http://localhost:{}/binary_ctnt".format(self.mock_server_port) - retreived = self.wg.getHeadTitleChromium(pg_url) - - expect = { - # If no title is specified, chromium returns the server URL - 'url': pg_url, - 'title': 'localhost:{}/binary_ctnt'.format(self.mock_server_port), - } - self.assertEqual(retreived, expect) diff --git a/util/WebRequest/tests/test_phantom.py b/util/WebRequest/tests/test_phantom.py deleted file mode 100644 index cce440f..0000000 --- a/util/WebRequest/tests/test_phantom.py +++ /dev/null @@ -1,144 +0,0 @@ -import unittest -import socket -import json -import base64 -import zlib -import gzip -import bs4 -from http.server import BaseHTTPRequestHandler, HTTPServer -from threading import Thread - -import util.WebRequest as WebRequest - - -class MockServerRequestHandler(BaseHTTPRequestHandler): - def do_GET(self): - # Process an HTTP GET request and return a response with an HTTP 200 status. - print("Path: ", self.path) - - if self.path == "/": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/raw-txt": - self.send_response(200) - self.send_header('Content-type', "text/plain") - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/redirect/bad-1": - self.send_response(302) - self.end_headers() - - elif self.path == "/redirect/bad-2": - self.send_response(302) - self.send_header('location', "bad-2") - self.end_headers() - - elif self.path == "/redirect/bad-3": - self.send_response(302) - self.send_header('location', "gopher://www.google.com") - self.end_headers() - - elif self.path == "/redirect/from-1": - self.send_response(302) - self.send_header('location', "to-1") - self.end_headers() - - if self.path == "/redirect/to-1": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-1") - - elif self.path == "/redirect/from-2": - self.send_response(302) - self.send_header('uri', "to-2") - self.end_headers() - - if self.path == "/redirect/to-2": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-2") - - elif self.path == "/redirect/from-3": - self.send_response(302) - newurl = "http://{}:{}".format(self.server.server_address[0], self.server.server_address[1]) - self.send_header('uri', newurl) - self.end_headers() - - -def get_free_port(): - s = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) - s.bind(('localhost', 0)) - address, port = s.getsockname() - s.close() - return port - - -class TestPhantomJS(unittest.TestCase): - def setUp(self): - - # Configure mock server. 
- self.mock_server_port = get_free_port() - self.mock_server = HTTPServer(('localhost', self.mock_server_port), MockServerRequestHandler) - - # Start running mock server in a separate thread. - # Daemon threads automatically shut down when the main process exits. - self.mock_server_thread = Thread(target=self.mock_server.serve_forever) - self.mock_server_thread.setDaemon(True) - self.mock_server_thread.start() - self.wg = WebRequest.WebGetRobust() - - def tearDown(self): - self.mock_server.shutdown() - - def test_fetch_1(self): - page = self.wg.getpage("http://localhost:{}".format(self.mock_server_port)) - self.assertEqual(page, 'Root OK?') - - def test_fetch_pjs(self): - page_1, fname_1, mtype_1 = self.wg.getItemPhantomJS("http://localhost:{}".format(self.mock_server_port)) - # I think all this garbage is phantomjs/selenium deciding they know what I want the content to look like for me. - # Note that the content isn't specified to be HTML ANYWHERE. - self.assertEqual(page_1, 'Root OK?') - - # Because PJS is retarded, it ALWAYS wraps content in html shit unless you specify the content is "text/html". If you do that, it then proceds to only - # add /some/ of the html tag garbage - page_2, fname_2, mtype_2 = self.wg.getItemPhantomJS("http://localhost:{}/raw-txt".format(self.mock_server_port)) - # I think all this garbage is phantomjs/selenium deciding they know what I want the content to look like for me. - # Note that the content isn't specified to be HTML ANYWHERE. - self.assertEqual( - page_2, - '
Root OK?
' - ) - - def test_head_pjs_1(self): - url_1 = "http://localhost:{}/raw-txt".format(self.mock_server_port) - purl_1 = self.wg.getHeadPhantomJS(url_1) - self.assertEqual(purl_1, url_1) - - url_2 = "http://localhost:{}/redirect/to-1".format(self.mock_server_port) - purl_2 = self.wg.getHeadPhantomJS("http://localhost:{}/redirect/from-1".format(self.mock_server_port)) - self.assertEqual(purl_2, url_2) - - # We expect to get the same value as passed, since pjs will not resolve out - # the bad redirects. - # Note we have to restart phantomjs for these tests, because otherwise it remembers state (this is why they're separate tests). - def test_head_pjs_2(self): - url_3 = "http://localhost:{}/redirect/bad-1".format(self.mock_server_port) - purl_3 = self.wg.getHeadPhantomJS("http://localhost:{}/redirect/bad-1".format(self.mock_server_port)) - self.assertEqual(purl_3, url_3) - - def test_head_pjs_3(self): - # Somehow, this turns into 'about:blank'. NFI how - url_4 = "about:blank" - purl_4 = self.wg.getHeadPhantomJS("http://localhost:{}/redirect/bad-2".format(self.mock_server_port)) - self.assertEqual(purl_4, url_4) - - def test_head_pjs_4(self): - # Somehow, this turns into 'about:blank'. NFI how - url_5 = "about:blank" - purl_5 = self.wg.getHeadPhantomJS("http://localhost:{}/redirect/bad-3".format(self.mock_server_port)) - self.assertEqual(purl_5, url_5) diff --git a/util/WebRequest/tests/test_simple.py b/util/WebRequest/tests/test_simple.py deleted file mode 100644 index e1d11d9..0000000 --- a/util/WebRequest/tests/test_simple.py +++ /dev/null @@ -1,330 +0,0 @@ -import unittest -import socket -import json -import base64 -import zlib -import gzip -import bs4 -from http.server import BaseHTTPRequestHandler, HTTPServer -from threading import Thread - -import util.WebRequest as WebRequest - - -class TestPlainCreation(unittest.TestCase): - def test_plain_instantiation_1(self): - wg = WebRequest.WebGetRobust() - self.assertTrue(wg is not None) - - def test_plain_instantiation_2(self): - wg = WebRequest.WebGetRobust(cloudflare=True) - self.assertTrue(wg is not None) - - def test_plain_instantiation_3(self): - wg = WebRequest.WebGetRobust(use_socks=True) - self.assertTrue(wg is not None) - - -class MockServerRequestHandler(BaseHTTPRequestHandler): - def do_GET(self): - # Process an HTTP GET request and return a response with an HTTP 200 status. - print("Path: ", self.path) - - if self.path == "/": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/html-decode": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/html/real": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"Root OK?") - - elif self.path == "/compressed/deflate": - self.send_response(200) - self.send_header('Content-Encoding', 'deflate') - self.send_header('Content-type', "text/html") - self.end_headers() - - inb = b"Root OK?" 
- cobj = zlib.compressobj(wbits=-zlib.MAX_WBITS) - t1 = cobj.compress(inb) + cobj.flush() - self.wfile.write(t1) - - elif self.path == "/compressed/gzip": - self.send_response(200) - self.send_header('Content-Encoding', 'gzip') - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(gzip.compress(b"Root OK?")) - - elif self.path == "/json/invalid": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"LOLWAT") - - elif self.path == "/json/valid": - self.send_response(200) - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b'{"oh" : "hai"}') - - elif self.path == "/json/no-coding": - self.send_response(200) - self.end_headers() - self.wfile.write(b'{"oh" : "hai"}') - - elif self.path == "/filename/path-only.txt": - self.send_response(200) - self.end_headers() - self.wfile.write(b"LOLWAT?") - elif self.path == "/filename/content-disposition": - self.send_response(200) - self.send_header('Content-Disposition', "filename=lolercoaster.txt") - self.end_headers() - self.wfile.write(b"LOLWAT?") - - elif self.path == "/filename_mime/path-only.txt": - self.send_response(200) - self.end_headers() - self.wfile.write(b"LOLWAT?") - - elif self.path == "/filename_mime/content-disposition": - self.send_response(200) - self.send_header('Content-Disposition', "filename=lolercoaster.txt") - self.end_headers() - self.wfile.write(b"LOLWAT?") - - elif self.path == "/filename_mime/content-disposition-html-suffix": - self.send_response(200) - self.send_header('Content-Disposition', "filename=lolercoaster.html") - self.end_headers() - self.wfile.write(b"LOLWAT?") - - elif self.path == "/filename_mime/explicit-html-mime": - self.send_response(200) - self.send_header('Content-Disposition', "filename=lolercoaster.html") - self.send_header('Content-type', "text/html") - self.end_headers() - self.wfile.write(b"LOLWAT?") - - elif self.path == "/redirect/bad-1": - self.send_response(302) - self.end_headers() - - elif self.path == "/redirect/bad-2": - self.send_response(302) - self.send_header('location', "bad-2") - self.end_headers() - - elif self.path == "/redirect/bad-3": - self.send_response(302) - self.send_header('location', "gopher://www.google.com") - self.end_headers() - - elif self.path == "/redirect/from-1": - self.send_response(302) - self.send_header('location', "to-1") - self.end_headers() - - if self.path == "/redirect/to-1": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-1") - - elif self.path == "/redirect/from-2": - self.send_response(302) - self.send_header('uri', "to-2") - self.end_headers() - - if self.path == "/redirect/to-2": - self.send_response(200) - self.end_headers() - self.wfile.write(b"Redirect-To-2") - - elif self.path == "/redirect/from-3": - self.send_response(302) - newurl = "http://{}:{}".format(self.server.server_address[0], self.server.server_address[1]) - self.send_header('uri', newurl) - self.end_headers() - - elif self.path == "/password/expect": - - self.send_response(200) - self.end_headers() - - val = self.headers['Authorization'] - passval = val.split(" ")[-1] - passstr = base64.b64decode(passval) - - if passstr == b'lol:wat': - self.wfile.write(b"Password Ok?") - else: - self.wfile.write(b"Password Bad!") - - -def get_free_port(): - s = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) - s.bind(('localhost', 0)) - address, port = s.getsockname() - s.close() - return port - - -class 
TestSimpleFetch(unittest.TestCase): - def setUp(self): - - # Configure mock server. - self.mock_server_port = get_free_port() - self.mock_server = HTTPServer(('localhost', self.mock_server_port), MockServerRequestHandler) - - # Start running mock server in a separate thread. - # Daemon threads automatically shut down when the main process exits. - self.mock_server_thread = Thread(target=self.mock_server.serve_forever) - self.mock_server_thread.setDaemon(True) - self.mock_server_thread.start() - self.wg = WebRequest.WebGetRobust() - - def tearDown(self): - self.mock_server.shutdown() - - def test_fetch_1(self): - page = self.wg.getpage("http://localhost:{}".format(self.mock_server_port)) - self.assertEqual(page, b'Root OK?') - - def test_fetch_decode_1(self): - # text/html content should be decoded automatically. - page = self.wg.getpage("http://localhost:{}/html-decode".format(self.mock_server_port)) - self.assertEqual(page, 'Root OK?') - - def test_fetch_soup(self): - # text/html content should be decoded automatically. - page = self.wg.getSoup("http://localhost:{}/html/real".format(self.mock_server_port)) - self.assertEqual(page, bs4.BeautifulSoup('Root OK?', 'lxml')) - - page = self.wg.getSoup("http://localhost:{}/html-decode".format(self.mock_server_port)) - self.assertEqual(page, bs4.BeautifulSoup('
Root OK?
', 'lxml')) - - # getSoup fails to fetch content that's not of content-type text/html - with self.assertRaises(WebRequest.ContentTypeError): - page = self.wg.getSoup("http://localhost:{}/".format(self.mock_server_port)) - - def test_fetch_decode_json(self): - # text/html content should be decoded automatically. - page = self.wg.getJson("http://localhost:{}/json/valid".format(self.mock_server_port)) - self.assertEqual(page, {'oh': 'hai'}) - - page = self.wg.getJson("http://localhost:{}/json/no-coding".format(self.mock_server_port)) - self.assertEqual(page, {'oh': 'hai'}) - - with self.assertRaises(json.decoder.JSONDecodeError): - page = self.wg.getJson("http://localhost:{}/json/invalid".format(self.mock_server_port)) - - def test_fetch_compressed(self): - - page = self.wg.getpage("http://localhost:{}/compressed/gzip".format(self.mock_server_port)) - self.assertEqual(page, 'Root OK?') - - page = self.wg.getpage("http://localhost:{}/compressed/deflate".format(self.mock_server_port)) - self.assertEqual(page, 'Root OK?') - - def test_file_and_name(self): - page, fn = self.wg.getFileAndName("http://localhost:{}/filename/path-only.txt".format(self.mock_server_port)) - self.assertEqual(page, b'LOLWAT?') - self.assertEqual(fn, '') - - page, fn = self.wg.getFileAndName("http://localhost:{}/filename/content-disposition".format(self.mock_server_port)) - self.assertEqual(page, b'LOLWAT?') - self.assertEqual(fn, 'lolercoaster.txt') - - def test_file_name_mime(self): - page, fn, mimet = self.wg.getFileNameMime( - "http://localhost:{}/filename_mime/path-only.txt".format(self.mock_server_port)) - self.assertEqual(page, b'LOLWAT?') - self.assertEqual(fn, '') - self.assertEqual(mimet, 'text/plain') - - page, fn, mimet = self.wg.getFileNameMime( - "http://localhost:{}/filename_mime/content-disposition".format(self.mock_server_port)) - self.assertEqual(page, b'LOLWAT?') - self.assertEqual(fn, 'lolercoaster.txt') - self.assertEqual(mimet, 'text/plain') - - page, fn, mimet = self.wg.getFileNameMime( - "http://localhost:{}/filename_mime/content-disposition-html-suffix".format(self.mock_server_port)) - self.assertEqual(page, b'LOLWAT?') - self.assertEqual(fn, 'lolercoaster.html') - self.assertEqual(mimet, 'text/plain') - - page, fn, mimet = self.wg.getFileNameMime( - "http://localhost:{}/filename_mime/explicit-html-mime".format(self.mock_server_port)) - self.assertEqual(page, 'LOLWAT?') - self.assertEqual(fn, 'lolercoaster.html') - self.assertEqual(mimet, 'text/html') - - def test_get_head(self): - inurl_1 = "http://localhost:{}".format(self.mock_server_port) - nurl_1 = self.wg.getHead(inurl_1) - self.assertEqual(inurl_1, nurl_1) - - inurl_2 = "http://localhost:{}/filename_mime/content-disposition".format(self.mock_server_port) - nurl_2 = self.wg.getHead(inurl_2) - self.assertEqual(inurl_2, nurl_2) - - def test_redirect_handling(self): - - inurl_1 = "http://localhost:{}/redirect/from-1".format(self.mock_server_port) - ctnt_1 = self.wg.getpage(inurl_1) - self.assertEqual(ctnt_1, b"Redirect-To-1") - - inurl_2 = "http://localhost:{}/redirect/from-2".format(self.mock_server_port) - ctnt_2 = self.wg.getpage(inurl_2) - self.assertEqual(ctnt_2, b"Redirect-To-2") - - inurl_3 = "http://localhost:{}/redirect/from-1".format(self.mock_server_port) - outurl_3 = "http://localhost:{}/redirect/to-1".format(self.mock_server_port) - nurl_3 = self.wg.getHead(inurl_3) - self.assertEqual(outurl_3, nurl_3) - - inurl_4 = "http://localhost:{}/redirect/from-2".format(self.mock_server_port) - outurl_4 = 
"http://localhost:{}/redirect/to-2".format(self.mock_server_port) - nurl_4 = self.wg.getHead(inurl_4) - self.assertEqual(outurl_4, nurl_4) - - # This is a redirect without the actual redirect - with self.assertRaises(WebRequest.FetchFailureError): - inurl_5 = "http://localhost:{}/redirect/bad-1".format(self.mock_server_port) - nurl_5 = self.wg.getHead(inurl_5) - - # This is a infinitely recursive redirect. - with self.assertRaises(WebRequest.FetchFailureError): - inurl_6 = "http://localhost:{}/redirect/bad-2".format(self.mock_server_port) - nurl_6 = self.wg.getHead(inurl_6) - - # This is a infinitely recursive redirect. - with self.assertRaises(WebRequest.FetchFailureError): - inurl_6 = "http://localhost:{}/redirect/bad-3".format(self.mock_server_port) - nurl_6 = self.wg.getHead(inurl_6) - - inurl_7 = "http://localhost:{}/redirect/from-3".format(self.mock_server_port) - # Assumes localhost resolves to 127.0.0.1. Is this ever not true? TCPv6? - outurl_7 = "http://127.0.0.1:{}/".format(self.mock_server_port) - nurl_7 = self.wg.getHead(inurl_7) - self.assertEqual(outurl_7, nurl_7) - - def test_http_auth(self): - wg_1 = WebRequest.WebGetRobust(creds=[("localhost:{}".format(self.mock_server_port), "lol", "wat")]) - page = wg_1.getpage("http://localhost:{}/password/expect".format(self.mock_server_port)) - self.assertEqual(page, b'Password Ok?') - - wg_2 = WebRequest.WebGetRobust(creds=[("localhost:{}".format(self.mock_server_port), "lol", "nope")]) - page = wg_2.getpage("http://localhost:{}/password/expect".format(self.mock_server_port)) - self.assertEqual(page, b'Password Bad!')