-booruScraper/util/WebRequest/ChromiumMixin.py

#!/usr/bin/python3
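"""
ChromiumMixin: Chromium-backed fetch methods for the WebRequest classes.

The mixin assumes the composed class provides ``self.log`` (a logger),
``self.browserHeaders`` (a list of header 2-tuples), and ``self.cj`` (a
cookiejar). Headers and cookies are synced into each short-lived Chromium
session before navigation, and cookies are synced back out afterwards.
"""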
import time
import logging
import random
import traceback
import urllib.parse
import threading
import multiprocessing
import gc
import bs4
import ChromeController
# from cachetools import LRUCache

# class ChromeLRUCache(LRUCache):

# 	def __init__(self, *args, **kwargs):
# 		super().__init__(*args, **kwargs)
# 		self.log = logging.getLogger("Main.ChromeInterfaceCache")

# 	def close_chrome(self, pop_key, to_del):
# 		try:
# 			self.log.info("LRU Cache is closing chromium interface for %s", pop_key)
# 			to_del.close()
# 		except Exception:
# 			self.log.error("Exception in chromium teardown!")
# 			for line in traceback.format_exc().split("\n"):
# 				self.log.error("	%s", line)

# 	def popitem(self):
# 		pop_key, to_del = super().popitem()
# 		self.close_chrome(pop_key, to_del)

# 	def close_by_key(self, key):
# 		pop_key, to_del = self.pop(key)
# 		self.close_chrome(pop_key, to_del)

# 	def get_chromium_instance(self, cr_binary, cr_port):
# 		cpid = multiprocessing.current_process().name
# 		ctid = threading.current_thread().name
# 		csid = "{}-{}".format(cpid, ctid)

# 		if csid in self:
# 			self.log.info("Using existing chromium process.")
# 			# We probe the remote chrome to make sure it's not defunct
# 			try:
# 				self[csid].get_current_url()
# 				return self[csid]
# 			except ChromeController.ChromeControllerException:
# 				self.log.error("Chromium appears to be defunct. Creating new.")
# 				self.close_by_key(csid)

# 		self.log.info("Creating Chromium process.")
# 		try:
# 			instance = ChromeController.ChromeRemoteDebugInterface(cr_binary, dbg_port=cr_port)
# 		except Exception as e:
# 			self.log.error("Failure creating chromium process!")
# 			for line in traceback.format_exc().split("\n"):
# 				self.log.error("	%s", line)
# 			# Sometimes the old process is around because the GC hasn't seen it,
# 			# and forcing a collection can fix that. Yes, this is HORRIBLE.
# 			gc.collect()
# 			raise e

# 		self[csid] = instance
# 		return instance

# CHROME_CACHE = ChromeLRUCache(maxsize=2)
class WebGetCrMixin(object):
	# creds is a list of 3-tuples that gets inserted into the password manager.
	# It is structured [(top_level_url1, username1, password1), (top_level_url2, username2, password2)]
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self._cr_binary = "google-chrome"
	def _syncIntoChromium(self, cr):
		# Headers are a list of 2-tuples. We need a dict.
		hdict = dict(self.browserHeaders)
		cr.update_headers(hdict)
		for cookie in self.cj:
			cr.set_cookie(cookie)

	def _syncOutOfChromium(self, cr):
		for cookie in cr.get_cookies():
			self.cj.set_cookie(cookie)
	def getItemChromium(self, itemUrl):
		'''
		Fetch ``itemUrl`` in a temporary Chromium session, returning a
		``(content, fileN, mType)`` 3-tuple: the page content, the filename
		derived from the resolved URL, and a guessed mimetype.
		'''
		self.log.info("Fetching page for URL: '%s' with Chromium", itemUrl)

		with ChromeController.ChromeContext(self._cr_binary) as cr:
			self._syncIntoChromium(cr)

			response = cr.blocking_navigate_and_get_source(itemUrl, timeout=10)

			raw_url = cr.get_current_url()
			fileN = urllib.parse.unquote(urllib.parse.urlparse(raw_url)[2].split("/")[-1])
			fileN = bs4.UnicodeDammit(fileN).unicode_markup

			self._syncOutOfChromium(cr)

		# Probably a bad assumption
		if response['binary']:
			mType = "application/x-binary"
		else:
			mType = "text/html"

		# So, self._cr.page_source appears to be the *compressed* page source as-rendered. Because reasons.
		content = response['content']
		return content, fileN, mType
	def getHeadTitleChromium(self, url, referrer=None):
		self.log.info("Getting HEAD with Chromium")
		if not referrer:
			referrer = url

		with ChromeController.ChromeContext(self._cr_binary) as cr:
			self._syncIntoChromium(cr)

			# Hit the referrer first, wait a randomized interval, then navigate
			# to the target, presumably to mimic an organic click-through.
			cr.blocking_navigate(referrer)
			time.sleep(random.uniform(2, 6))
			cr.blocking_navigate(url)

			title, cur_url = cr.get_page_url_title()

			self._syncOutOfChromium(cr)

		self.log.info("Resolved URL for %s -> %s", url, cur_url)

		ret = {
			'url'   : cur_url,
			'title' : title,
		}
		return ret
	def getHeadChromium(self, url, referrer=None):
		self.log.info("Getting HEAD with Chromium")
		if not referrer:
			referrer = url

		with ChromeController.ChromeContext(self._cr_binary) as cr:
			self._syncIntoChromium(cr)

			cr.blocking_navigate(referrer)
			time.sleep(random.uniform(2, 6))
			cr.blocking_navigate(url)

			dummy_title, cur_url = cr.get_page_url_title()

			self._syncOutOfChromium(cr)

		return cur_url
	def chromiumGetRenderedItem(self, url):
		with ChromeController.ChromeContext(self._cr_binary) as cr:
			self._syncIntoChromium(cr)

			cr.blocking_navigate(url)

			content = cr.get_rendered_page_source()
			mType   = 'text/html'
			fileN   = ''

			self._syncOutOfChromium(cr)

		return content, fileN, mType
	def __del__(self):
		# print("ChromiumMixin destructor")
		sup = super()
		if hasattr(sup, '__del__'):
			sup.__del__()
	# def stepThroughCloudFlare_cr(self, url, titleContains='', titleNotContains=''):
	# 	'''
	# 	Use Selenium+Chromium to access a resource behind cloudflare protection.

	# 	Params:
	# 		``url`` - The URL to access that is protected by cloudflare
	# 		``titleContains`` - A string that is in the title of the protected page, and NOT the
	# 			cloudflare intermediate page. The presence of this string in the page title
	# 			is used to determine whether the cloudflare protection has been successfully
	# 			penetrated.

	# 	The current WebGetRobust headers are installed into the selenium browser, which
	# 	is then used to access the protected resource.

	# 	Once the protected page has properly loaded, the cloudflare access cookie is
	# 	then extracted from the selenium browser, and installed back into the WebGetRobust
	# 	instance, so it can continue to use the cloudflare auth in normal requests.
	# 	'''

	# 	if (not titleContains) and (not titleNotContains):
	# 		raise ValueError("You must pass either a string the title should contain, or a string the title shouldn't contain!")

	# 	if titleContains and titleNotContains:
	# 		raise ValueError("You can only pass a single conditional statement!")

	# 	self.log.info("Attempting to access page through cloudflare browser verification.")

	# 	dcap = dict(DesiredCapabilities.Chromium)
	# 	wgSettings = dict(self.browserHeaders)

	# 	# Install the headers from the WebGet class into Chromium
	# 	dcap["Chromium.page.settings.userAgent"] = wgSettings.pop('User-Agent')
	# 	for headerName in wgSettings:
	# 		dcap['Chromium.page.customHeaders.{header}'.format(header=headerName)] = wgSettings[headerName]

	# 	driver = selenium.webdriver.Chromium(desired_capabilities=dcap)
	# 	driver.set_window_size(1024, 768)

	# 	driver.get(url)

	# 	if titleContains:
	# 		condition = EC.title_contains(titleContains)
	# 	elif titleNotContains:
	# 		condition = title_not_contains(titleNotContains)
	# 	else:
	# 		raise ValueError("Wat?")

	# 	try:
	# 		WebDriverWait(driver, 20).until(condition)
	# 		success = True
	# 		self.log.info("Successfully accessed main page!")
	# 	except TimeoutException:
	# 		self.log.error("Could not pass through cloudflare blocking!")
	# 		success = False

	# 	# Add cookies to cookiejar
	# 	for cookie in driver.get_cookies():
	# 		self.addSeleniumCookie(cookie)
	# 		# print(cookie["value"])

	# 	self.__syncCookiesFromFile()

	# 	return success
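
# Minimal usage sketch. This is an illustrative composition only: the real
# base class elsewhere in this package supplies log/browserHeaders/cj, which
# are stubbed out here with hypothetical values. Running it requires the
# ChromeController package, a local google-chrome binary, and network access.
if __name__ == "__main__":
	import http.cookiejar

	class _StubBase(object):
		def __init__(self, *args, **kwargs):
			self.log            = logging.getLogger("Main.WebRequest")
			self.browserHeaders = [('Accept-Language', 'en-US,en;q=0.9')]
			self.cj             = http.cookiejar.CookieJar()

	class _StubWebGet(WebGetCrMixin, _StubBase):
		pass

	logging.basicConfig(level=logging.INFO)
	wg = _StubWebGet()
	content, fileN, mType = wg.getItemChromium("http://www.example.org")
	print("Fetched %s chars (%s), filename: %r" % (len(content), mType, fileN))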