278 lines
7.9 KiB
Python
278 lines
7.9 KiB
Python
#!/usr/bin/python3
|
|
|
|
import time
|
|
import random
|
|
import socket
|
|
import urllib.parse
|
|
import http.cookiejar
|
|
import bs4
|
|
import selenium.webdriver
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import TimeoutException
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
|
|
|
|
|
|
class title_not_contains(object):
|
|
""" An expectation for checking that the title *does not* contain a case-sensitive
|
|
substring. title is the fragment of title expected
|
|
returns True when the title matches, False otherwise
|
|
"""
|
|
def __init__(self, title):
|
|
self.title = title
|
|
|
|
|
|
def __call__(self, driver):
|
|
return self.title not in driver.title
|
|
|
|
#pylint: disable-msg=E1101, C0325, R0201, W0702, W0703
|
|
|
|
def wait_for(condition_function):
|
|
start_time = time.time()
|
|
while time.time() < start_time + 3:
|
|
if condition_function():
|
|
return True
|
|
else:
|
|
time.sleep(0.1)
|
|
raise Exception(
|
|
'Timeout waiting for {}'.format(condition_function.__name__)
|
|
)
|
|
|
|
class load_delay_context_manager(object):
|
|
|
|
def __init__(self, browser):
|
|
self.browser = browser
|
|
|
|
def __enter__(self):
|
|
self.old_page = self.browser.find_element_by_tag_name('html')
|
|
|
|
def page_has_loaded(self):
|
|
new_page = self.browser.find_element_by_tag_name('html')
|
|
return new_page.id != self.old_page.id
|
|
|
|
def __exit__(self, *_):
|
|
wait_for(self.page_has_loaded)
|
|
|
|
|
|
class WebGetPjsMixin(object):
|
|
def __init__(self, *args, **kwargs):
|
|
super().__init__(*args, **kwargs)
|
|
|
|
self.pjs_driver = None
|
|
|
|
def _initPjsWebDriver(self):
|
|
if self.pjs_driver:
|
|
self.pjs_driver.quit()
|
|
dcap = dict(DesiredCapabilities.PHANTOMJS)
|
|
wgSettings = dict(self.browserHeaders)
|
|
# Install the headers from the WebGet class into phantomjs
|
|
dcap["phantomjs.page.settings.userAgent"] = wgSettings.pop('User-Agent')
|
|
for headerName in wgSettings:
|
|
if headerName != 'Accept-Encoding':
|
|
dcap['phantomjs.page.customHeaders.{header}'.format(header=headerName)] = wgSettings[headerName]
|
|
|
|
self.pjs_driver = selenium.webdriver.PhantomJS(desired_capabilities=dcap)
|
|
self.pjs_driver.set_window_size(1280, 1024)
|
|
|
|
|
|
def _syncIntoPjsWebDriver(self):
|
|
'''
|
|
So selenium is completely retarded, and you can't just set cookes, you have to
|
|
be navigated to the domain for which you want to set cookies.
|
|
This is extra double-plus idiotic, as it means you can't set cookies up
|
|
before navigating.
|
|
Sigh.
|
|
'''
|
|
pass
|
|
# for cookie in self.getCookies():
|
|
# print("Cookie: ", cookie)
|
|
|
|
# cookurl = [
|
|
# "http" if cookieDict['httponly'] else "https", # scheme 0 URL scheme specifier
|
|
# cookie.domain, # netloc 1 Network location part
|
|
# "/", # path 2 Hierarchical path
|
|
# "", # params 3 Parameters for last path element
|
|
# "", # query 4 Query component
|
|
# "", # fragment 5 Fragment identifier
|
|
# ]
|
|
|
|
# cdat = {
|
|
# 'name' : cookie.name,
|
|
# 'value' : cookie.value,
|
|
# 'domain' : cookie.domain,
|
|
# 'path' :
|
|
# 'expiry' :
|
|
# }
|
|
# print("CDat: ", cdat)
|
|
|
|
# self.pjs_driver.add_cookie(cdat)
|
|
|
|
|
|
def _syncOutOfPjsWebDriver(self):
|
|
for cookie in self.pjs_driver.get_cookies():
|
|
self.addSeleniumCookie(cookie)
|
|
|
|
|
|
def getItemPhantomJS(self, itemUrl):
|
|
self.log.info("Fetching page for URL: '%s' with PhantomJS" % itemUrl)
|
|
|
|
if not self.pjs_driver:
|
|
self._initPjsWebDriver()
|
|
self._syncIntoPjsWebDriver()
|
|
|
|
with load_delay_context_manager(self.pjs_driver):
|
|
self.pjs_driver.get(itemUrl)
|
|
time.sleep(3)
|
|
|
|
fileN = urllib.parse.unquote(urllib.parse.urlparse(self.pjs_driver.current_url)[2].split("/")[-1])
|
|
fileN = bs4.UnicodeDammit(fileN).unicode_markup
|
|
|
|
self._syncOutOfPjsWebDriver()
|
|
|
|
# Probably a bad assumption
|
|
mType = "text/html"
|
|
|
|
# So, self.pjs_driver.page_source appears to be the *compressed* page source as-rendered. Because reasons.
|
|
source = self.pjs_driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
|
|
|
|
assert source != '<head></head><body></body>'
|
|
|
|
source = "<html>"+source+"</html>"
|
|
return source, fileN, mType
|
|
|
|
|
|
|
|
def getHeadTitlePhantomJS(self, url, referrer=None):
|
|
self.getHeadPhantomJS(url, referrer)
|
|
ret = {
|
|
'url' : self.pjs_driver.current_url,
|
|
'title' : self.pjs_driver.title,
|
|
}
|
|
return ret
|
|
|
|
def getHeadPhantomJS(self, url, referrer=None):
|
|
self.log.info("Getting HEAD with PhantomJS")
|
|
|
|
if not self.pjs_driver:
|
|
self._initPjsWebDriver()
|
|
self._syncIntoPjsWebDriver()
|
|
|
|
def try_get(loc_url):
|
|
tries = 3
|
|
for x in range(9999):
|
|
try:
|
|
self.pjs_driver.get(loc_url)
|
|
time.sleep(random.uniform(2, 6))
|
|
return
|
|
except socket.timeout as e:
|
|
if x > tries:
|
|
raise e
|
|
if referrer:
|
|
try_get(referrer)
|
|
try_get(url)
|
|
|
|
self._syncOutOfPjsWebDriver()
|
|
|
|
return self.pjs_driver.current_url
|
|
|
|
def addSeleniumCookie(self, cookieDict):
|
|
'''
|
|
Install a cookie exported from a selenium webdriver into
|
|
the active opener
|
|
'''
|
|
# print cookieDict
|
|
cookie = http.cookiejar.Cookie(
|
|
version = 0,
|
|
name = cookieDict['name'],
|
|
value = cookieDict['value'],
|
|
port = None,
|
|
port_specified = False,
|
|
domain = cookieDict['domain'],
|
|
domain_specified = True,
|
|
domain_initial_dot = False,
|
|
path = cookieDict['path'],
|
|
path_specified = False,
|
|
secure = cookieDict['secure'],
|
|
expires = cookieDict['expiry'] if 'expiry' in cookieDict else None,
|
|
discard = False,
|
|
comment = None,
|
|
comment_url = None,
|
|
rest = {"httponly":"%s" % cookieDict['httponly']},
|
|
rfc2109 = False
|
|
)
|
|
|
|
self.addCookie(cookie)
|
|
|
|
|
|
def __del__(self):
|
|
# print("PhantomJS __del__")
|
|
if self.pjs_driver != None:
|
|
self.pjs_driver.quit()
|
|
|
|
sup = super()
|
|
if hasattr(sup, '__del__'):
|
|
sup.__del__()
|
|
|
|
|
|
def stepThroughCloudFlare_pjs(self, url, titleContains='', titleNotContains=''):
|
|
'''
|
|
Use Selenium+PhantomJS to access a resource behind cloudflare protection.
|
|
|
|
Params:
|
|
``url`` - The URL to access that is protected by cloudflare
|
|
``titleContains`` - A string that is in the title of the protected page, and NOT the
|
|
cloudflare intermediate page. The presence of this string in the page title
|
|
is used to determine whether the cloudflare protection has been successfully
|
|
penetrated.
|
|
|
|
The current WebGetRobust headers are installed into the selenium browser, which
|
|
is then used to access the protected resource.
|
|
|
|
Once the protected page has properly loaded, the cloudflare access cookie is
|
|
then extracted from the selenium browser, and installed back into the WebGetRobust
|
|
instance, so it can continue to use the cloudflare auth in normal requests.
|
|
|
|
'''
|
|
|
|
if (not titleContains) and (not titleNotContains):
|
|
raise ValueError("You must pass either a string the title should contain, or a string the title shouldn't contain!")
|
|
|
|
if titleContains and titleNotContains:
|
|
raise ValueError("You can only pass a single conditional statement!")
|
|
|
|
self.log.info("Attempting to access page through cloudflare browser verification.")
|
|
|
|
if not self.pjs_driver:
|
|
self._initPjsWebDriver()
|
|
self._syncIntoPjsWebDriver()
|
|
|
|
|
|
self.pjs_driver.get(url)
|
|
|
|
if titleContains:
|
|
condition = EC.title_contains(titleContains)
|
|
elif titleNotContains:
|
|
condition = title_not_contains(titleNotContains)
|
|
else:
|
|
raise ValueError("Wat?")
|
|
|
|
|
|
try:
|
|
WebDriverWait(self.pjs_driver, 20).until(condition)
|
|
success = True
|
|
self.log.info("Successfully accessed main page!")
|
|
except TimeoutException:
|
|
self.log.error("Could not pass through cloudflare blocking!")
|
|
success = False
|
|
# Add cookies to cookiejar
|
|
|
|
self._syncOutOfPjsWebDriver()
|
|
|
|
self.__syncCookiesFromFile()
|
|
|
|
return success
|
|
|
|
|