-booruScraper/util/WebRequest/PhantomJSMixin.py

278 lines
7.9 KiB
Python

#!/usr/bin/python3
import time
import random
import socket
import urllib.parse
import http.cookiejar
import bs4
import selenium.webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
class title_not_contains(object):
""" An expectation for checking that the title *does not* contain a case-sensitive
substring. title is the fragment of title expected
returns True when the title matches, False otherwise
"""
def __init__(self, title):
self.title = title
def __call__(self, driver):
return self.title not in driver.title
#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703
def wait_for(condition_function):
start_time = time.time()
while time.time() < start_time + 3:
if condition_function():
return True
else:
time.sleep(0.1)
raise Exception(
'Timeout waiting for {}'.format(condition_function.__name__)
)
class load_delay_context_manager(object):
def __init__(self, browser):
self.browser = browser
def __enter__(self):
self.old_page = self.browser.find_element_by_tag_name('html')
def page_has_loaded(self):
new_page = self.browser.find_element_by_tag_name('html')
return new_page.id != self.old_page.id
def __exit__(self, *_):
wait_for(self.page_has_loaded)
class WebGetPjsMixin(object):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pjs_driver = None
def _initPjsWebDriver(self):
if self.pjs_driver:
self.pjs_driver.quit()
dcap = dict(DesiredCapabilities.PHANTOMJS)
wgSettings = dict(self.browserHeaders)
# Install the headers from the WebGet class into phantomjs
dcap["phantomjs.page.settings.userAgent"] = wgSettings.pop('User-Agent')
for headerName in wgSettings:
if headerName != 'Accept-Encoding':
dcap['phantomjs.page.customHeaders.{header}'.format(header=headerName)] = wgSettings[headerName]
self.pjs_driver = selenium.webdriver.PhantomJS(desired_capabilities=dcap)
self.pjs_driver.set_window_size(1280, 1024)
def _syncIntoPjsWebDriver(self):
'''
So selenium is completely retarded, and you can't just set cookes, you have to
be navigated to the domain for which you want to set cookies.
This is extra double-plus idiotic, as it means you can't set cookies up
before navigating.
Sigh.
'''
pass
# for cookie in self.getCookies():
# print("Cookie: ", cookie)
# cookurl = [
# "http" if cookieDict['httponly'] else "https", # scheme 0 URL scheme specifier
# cookie.domain, # netloc 1 Network location part
# "/", # path 2 Hierarchical path
# "", # params 3 Parameters for last path element
# "", # query 4 Query component
# "", # fragment 5 Fragment identifier
# ]
# cdat = {
# 'name' : cookie.name,
# 'value' : cookie.value,
# 'domain' : cookie.domain,
# 'path' :
# 'expiry' :
# }
# print("CDat: ", cdat)
# self.pjs_driver.add_cookie(cdat)
def _syncOutOfPjsWebDriver(self):
for cookie in self.pjs_driver.get_cookies():
self.addSeleniumCookie(cookie)
def getItemPhantomJS(self, itemUrl):
self.log.info("Fetching page for URL: '%s' with PhantomJS" % itemUrl)
if not self.pjs_driver:
self._initPjsWebDriver()
self._syncIntoPjsWebDriver()
with load_delay_context_manager(self.pjs_driver):
self.pjs_driver.get(itemUrl)
time.sleep(3)
fileN = urllib.parse.unquote(urllib.parse.urlparse(self.pjs_driver.current_url)[2].split("/")[-1])
fileN = bs4.UnicodeDammit(fileN).unicode_markup
self._syncOutOfPjsWebDriver()
# Probably a bad assumption
mType = "text/html"
# So, self.pjs_driver.page_source appears to be the *compressed* page source as-rendered. Because reasons.
source = self.pjs_driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
assert source != '<head></head><body></body>'
source = "<html>"+source+"</html>"
return source, fileN, mType
def getHeadTitlePhantomJS(self, url, referrer=None):
self.getHeadPhantomJS(url, referrer)
ret = {
'url' : self.pjs_driver.current_url,
'title' : self.pjs_driver.title,
}
return ret
def getHeadPhantomJS(self, url, referrer=None):
self.log.info("Getting HEAD with PhantomJS")
if not self.pjs_driver:
self._initPjsWebDriver()
self._syncIntoPjsWebDriver()
def try_get(loc_url):
tries = 3
for x in range(9999):
try:
self.pjs_driver.get(loc_url)
time.sleep(random.uniform(2, 6))
return
except socket.timeout as e:
if x > tries:
raise e
if referrer:
try_get(referrer)
try_get(url)
self._syncOutOfPjsWebDriver()
return self.pjs_driver.current_url
def addSeleniumCookie(self, cookieDict):
'''
Install a cookie exported from a selenium webdriver into
the active opener
'''
# print cookieDict
cookie = http.cookiejar.Cookie(
version = 0,
name = cookieDict['name'],
value = cookieDict['value'],
port = None,
port_specified = False,
domain = cookieDict['domain'],
domain_specified = True,
domain_initial_dot = False,
path = cookieDict['path'],
path_specified = False,
secure = cookieDict['secure'],
expires = cookieDict['expiry'] if 'expiry' in cookieDict else None,
discard = False,
comment = None,
comment_url = None,
rest = {"httponly":"%s" % cookieDict['httponly']},
rfc2109 = False
)
self.addCookie(cookie)
def __del__(self):
# print("PhantomJS __del__")
if self.pjs_driver != None:
self.pjs_driver.quit()
sup = super()
if hasattr(sup, '__del__'):
sup.__del__()
def stepThroughCloudFlare_pjs(self, url, titleContains='', titleNotContains=''):
'''
Use Selenium+PhantomJS to access a resource behind cloudflare protection.
Params:
``url`` - The URL to access that is protected by cloudflare
``titleContains`` - A string that is in the title of the protected page, and NOT the
cloudflare intermediate page. The presence of this string in the page title
is used to determine whether the cloudflare protection has been successfully
penetrated.
The current WebGetRobust headers are installed into the selenium browser, which
is then used to access the protected resource.
Once the protected page has properly loaded, the cloudflare access cookie is
then extracted from the selenium browser, and installed back into the WebGetRobust
instance, so it can continue to use the cloudflare auth in normal requests.
'''
if (not titleContains) and (not titleNotContains):
raise ValueError("You must pass either a string the title should contain, or a string the title shouldn't contain!")
if titleContains and titleNotContains:
raise ValueError("You can only pass a single conditional statement!")
self.log.info("Attempting to access page through cloudflare browser verification.")
if not self.pjs_driver:
self._initPjsWebDriver()
self._syncIntoPjsWebDriver()
self.pjs_driver.get(url)
if titleContains:
condition = EC.title_contains(titleContains)
elif titleNotContains:
condition = title_not_contains(titleNotContains)
else:
raise ValueError("Wat?")
try:
WebDriverWait(self.pjs_driver, 20).until(condition)
success = True
self.log.info("Successfully accessed main page!")
except TimeoutException:
self.log.error("Could not pass through cloudflare blocking!")
success = False
# Add cookies to cookiejar
self._syncOutOfPjsWebDriver()
self.__syncCookiesFromFile()
return success