# booruScraper/scraper/modules/e621Scrape.py

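"""
Scraper plugin that fetches posts from e621.net.

For each job, the fetcher retrieves the post page, extracts tag, artist,
character, and stats metadata into the job row, downloads the image file,
and commits the result to the database.
"""
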
import traceback
import urllib.error
import urllib.parse
import re
import time
import datetime
import parsedatetime
import sqlalchemy.exc
import scraper.runstate
import scraper.fetchBase
import scraper.database as db

import WebRequest

class E621Fetcher(scraper.fetchBase.AbstractFetcher):
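    """
    Fetcher plugin for e621.net. The job queue, HTTP session (self.wg),
    logging, and file persistence come from AbstractFetcher; this class
    supplies the site-specific page parsing.
    """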

    pluginkey = 'e621'
    loggerpath = "Main.e621"

    def __init__(self):
        super().__init__()

    def get_content_count_max(self):
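        """
        Return the highest post ID visible on the site index page.

        Post IDs are monotonically increasing, so the largest thumbnail ID
        on https://e621.net/post is an upper bound on the post-ID space.
        """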
        soup = self.wg.getSoup('https://e621.net/post')
        thumbs = soup.find_all('span', class_='thumb')
        tids = [tmp.get("id", "").strip("p") for tmp in thumbs]
        tids = [int(tmp) for tmp in tids if tmp]
        maxid = max(tids)
        return maxid

    def extractTags(self, job, tagsection):
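        """
        Pull the character, species, copyright, artist, and general tags out
        of the tag sidebar, appending any new values to the job's tag,
        artist, and character lists. Species and copyright tags are stored
        as general tags with a 'species '/'copyright ' prefix.
        """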
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        specieslis = tagsection.find_all('li', class_='tag-type-species')
        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        taglis = tagsection.find_all('li', class_='tag-type-general')

        tags = []
        for tagli in taglis:
            tag = tagli.find_all('a')[-1].get_text()
            tags.append(tag)
        for speciesli in specieslis:
            tag = speciesli.find_all('a')[-1].get_text()
            tags.append("species " + tag)
        for copyrli in copyrlis:
            tag = copyrli.find_all('a')[-1].get_text()
            tags.append("copyright " + tag)

        artists = []
        for artistli in artistlis:
            # Some artist entries contain no links at all; skip those
            # rather than indexing into an empty result set.
            if artistli.find_all('a'):
                artist = artistli.find_all('a')[-1].get_text()
                artists.append(artist)

        characters = []
        for characterli in characterlis:
            character = characterli.find_all('a')[-1].get_text()
            characters.append(character)

        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
        for artist in artists:
            if artist not in job.artist:
                job.artist.append(artist)
        for character in characters:
            if character not in job.character:
                job.character.append(character)

    def getxy(self, instr):
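        """
        Extract a '<width>x<height>' resolution pair from a string and
        return it as a (width, height) tuple.
        """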
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return int(x), int(y)

    def extractInfo(self, job, infosection):
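        """
        Walk the '#stats' sidebar and copy the interesting post metadata
        (score, post date, resolution) onto the job. Returns the image URL
        from the 'Size' row if one is present, else None.
        """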
        imgurl = None
        for li in infosection.find_all("li"):
            rawt = li.get_text()
            if ":" in rawt:
                name, val = rawt.split(":", 1)
            elif "Posted" in rawt:
                name, val = rawt.split("Posted", 1)
                name = "Posted"
            else:
                # No key/value pair to unpack; log the entry and move on.
                self.log.warning("Don't know how to parse stats entry!")
                self.log.warning("Raw - '%s'", rawt)
                continue
            name = name.strip()
            val = val.strip()

            if name == 'Rating':
                pass
            elif name == 'Favorites':
                pass
            elif name == 'ID':
                pass
            elif name == 'Favorited by':
                pass
            elif name == 'Score':
                val = val.split()[0]
                job.score = val
            elif name == 'Posted':
                # Value is a date followed by 'by <uploader>'; drop the
                # uploader and let parsedatetime handle the fuzzy date.
                cal = parsedatetime.Calendar()
                val = val.split("by")[0]
                tstruct, pstat = cal.parse(val)
                # parsedatetime's parse status: 1 = date, 2 = time,
                # 3 = datetime. 0 means the parse failed.
                assert pstat in (1, 2, 3)
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                # Value looks like '<width>x<height> (<filesize>)', with
                # the image link attached to the same list item.
                res, fsize = val.split("(", 1)
                res = res.strip()
                job.imgx, job.imgy = self.getxy(res)
                link = li.find("a")
                if link:
                    imgurl = link['href']
            elif name == 'Status':
                pass
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning("    '%s' -> '%s'", name, val)

        return imgurl

    def extractMeta(self, job, soup):
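        """
        Extract tag and stats metadata from a parsed post page, updating
        the job in place. Returns the image URL, or None if no image link
        was found.
        """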
        tagsection = soup.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = soup.find('div', id='stats')
        assert infosection

        imgurl = self.extractInfo(job, infosection)
        self.extractTags(job, tagsection)
        return imgurl

    def fetchImage(self, job, url, srcurl):
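        """
        Download the image at url (resolved relative to the post page
        srcurl, which is also sent as the Referer), save it to disk, and
        mark the job complete.
        """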
        url = urllib.parse.urljoin(srcurl, url)
        fname = url.split("/")[-1]
        cont = self.wg.getpage(url, addlHeaders={'Referer': srcurl})

        fpath = self.saveFileRow(job, fname, cont)
        self.log.info("Saved file to path: '%s'", fpath)

        job.filename = fname
        job.filepath = fpath
        job.state = 'complete'
        db.session.commit()

    def processJob(self, job):
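        """
        Fetch and process a single post page: handle the paywalled,
        deleted, and flash special cases, then extract metadata and
        download the image. The job's state is updated and committed at
        every exit point.
        """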
        pageurl = 'https://e621.net/post/show/{}'.format(job.postid)
        try:
            soup = self.wg.getSoup(pageurl)
        except WebRequest.WebGetException:
            job.state = 'error'
            job.err_str = 'failure fetching container page'
            self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
            db.session.commit()
            return

        text = soup.get_text()
        if 'You need a gold account to see this image.' in text:
            job.state = 'removed'
            job.err_str = 'requires account'
            self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
            db.session.commit()
            return
        if 'This post was deleted by' in text:
            job.state = 'removed'
            job.err_str = 'post deleted'
            self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
            db.session.commit()
            return
        if 'Save this flash' in text:
            job.state = 'disabled'
            job.err_str = 'content is flash .swf'
            self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
            db.session.commit()
            return

        # IntegrityErrors can occur transiently (e.g. concurrent runners
        # racing on the same tag rows), so retry up to 5 times before
        # giving up on the job.
        err = 0
        while err < 5:
            try:
                imgurl = self.extractMeta(job, soup)
                if imgurl:
                    self.fetchImage(job, imgurl, pageurl)
                else:
                    self.log.info("No image found for URL: '%s'", pageurl)
                    job.state = 'error'
                    job.err_str = 'failed to find image!'
                    self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
                    db.session.commit()
                break
            except AssertionError:
                self.log.info("Assertion error?: '%s'", pageurl)
                traceback.print_exc()
                job.state = 'error'
                job.err_str = 'Assertion failure?'
                self.log.warning("Marking %s as %s (%s)", job.id, job.state, job.err_str)
                db.session.rollback()
                break
            except sqlalchemy.exc.IntegrityError:
                err += 1
                db.session.rollback()

    def retreiveItem(self):
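        """
        Pull one job from the queue and process it. Returns True if a job
        was processed, False if the queue is empty.
        """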
        job = self.get_job()
        if not job:
            return False
        self.processJob(job)
        return True

def run(indice):
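    """
    Worker entry point: runner number `indice` keeps pulling jobs until
    the queue is exhausted or the global run flag is cleared.
    """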
print("Runner {}!".format(indice))
fetcher = E621Fetcher()
remainingTasks = True
try:
while remainingTasks and scraper.runstate.run:
2017-04-14 20:38:05 -05:00
remainingTasks = fetcher.retreiveItem()
except KeyboardInterrupt:
return
except:
print("Unhandled exception!")
traceback.print_exc()
raise
def test():
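    """
    Smoke-test the metadata extraction against a single known post, using
    a bare function object as a stand-in for a database job row.
    """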
    fetcher = E621Fetcher()
    soup = fetcher.wg.getSoup("https://e621.net/post/show/28024")

    # A lambda is just a convenient object that accepts arbitrary
    # attribute assignment, so it can mimic a job row here.
    tmp = lambda: None
    tmp.tags = []
    tmp.character = []
    tmp.artist = []
    tmp.file = []
    fetcher.extractMeta(tmp, soup)

if __name__ == '__main__':
    import util.logSetup
    util.logSetup.initLogging()

    test()
    # run(1)