MOAR SITEZ

Fake-Name 2015-08-31 19:37:56 -07:00
parent af85fd8089
commit 1aa23060c6
4 changed files with 253 additions and 11 deletions


@@ -43,11 +43,12 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
while 1:
try:
job = db.session.query(db.Releases) \
.filter(db.Releases.dlstate == 0) \
.filter(db.Releases.source == self.pluginkey) \
.filter(db.Releases.dlstate == 0) \
.order_by(db.Releases.postid) \
.limit(1) \
.one()
.limit(1)
job = job.scalar()
if job == None:
return None
job.dlstate = 1


@@ -206,7 +206,7 @@ class GelbooruFetcher(object):
soup = self.wg.getSoup(pageurl)
if 'You are viewing an advertisement' in soup.get_text():
self.log.warning("Working around advertisement. Sleeping 10 seconds")
time.sleep(10)
time.sleep(13)
else:
break
except urllib.error.URLError:

main.py

@@ -6,6 +6,7 @@ logSetup.initLogging()
import danbooruFetch
import gelbooruFetch
import r34xxxScrape
import runstate
import concurrent.futures
@@ -16,8 +17,9 @@ def insertDanbooruStartingPoints():
tmp = db.session.query(db.Releases) \
.filter(db.Releases.postid == 1) \
.filter(db.Releases.source == 'Danbooru') \
.count()
.filter(db.Releases.source == 'Danbooru')
tmp = tmp.count()
if not tmp:
for x in range(2070000):
new = db.Releases(dlstate=0, postid=x, source='Danbooru')
@@ -52,6 +54,30 @@ def insertGelbooruStartingPoints():
# db.session.flush()
# print("Flushed.")
db.session.commit()
def insertR34xxxStartingPoints():
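    # Seed one dlstate=0 placeholder row per rule34.xxx post id with a single bulk INSERT,
    # unless the table already has a row for this source (postid 1 is used as the sentinel).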
tmp = db.session.query(db.Releases) \
.filter(db.Releases.postid == 1) \
.filter(db.Releases.source == 'Rule34.xxx') \
.count()
if not tmp:
print("Building insert data structure")
dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
print("Building insert query")
q = db.Releases.__table__.insert().values(dat)
print("Built. Doing insert.")
db.engine.execute(q)
print("Done.")
# for x in :
# new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
# # db.session.add(new)
# if x % 100000 == 0:
# print("Loop ", x, "flushing...")
# db.session.flush()
# print("Flushed.")
db.session.commit()
def resetDlstate():
@@ -64,20 +90,23 @@ def resetDlstate():
def go():
insertDanbooruStartingPoints()
insertGelbooruStartingPoints()
insertR34xxxStartingPoints()
resetDlstate()
# r34xxxScrape.run(0)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
try:
# for x in range(2):
# executor.submit(danbooruFetch.run, 0)
# executor.submit(gelbooruFetch.run, 0)
for x in range(THREADS//2):
executor.submit(danbooruFetch.run, x)
for x in range(THREADS//2):
executor.submit(gelbooruFetch.run, x)
for x in range(THREADS):
executor.submit(r34xxxScrape.run, x)
# for x in range(THREADS//2):
# executor.submit(danbooruFetch.run, x)
# for x in range(THREADS//2):
# executor.submit(gelbooruFetch.run, x)
executor.shutdown()
except KeyboardInterrupt:
print("Waiting for executor.")

r34xxxScrape.py Normal file

@@ -0,0 +1,212 @@
import database as db
import webFunctions
import logging
import traceback
import sqlalchemy.exc
import runstate
import urllib.error
import urllib.parse
import re
import parsedatetime
import os
import settings
import os.path
import time
import datetime
import fetchBase
import danbooruFetch
class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
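    # Scraper plugin for rule34.xxx. It reuses the inherited get_job()/saveFile() helpers
    # and overrides the page fetching and parsing for this site's markup.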
pluginkey = 'Rule34.xxx'
loggerpath = "Main.Rule34-xxx"
def __init__(self):
self.log = logging.getLogger("Main.Rule34-xxx")
self.wg = webFunctions.WebGetRobust(logPath="Main.Rule34-xxx.Web")
def extractTags(self, job, tagsection):
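        # The tag sidebar marks each <li> with tag-type-character / tag-type-artist /
        # tag-type-general; collect each group and append any values the job row lacks.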
characterlis = tagsection.find_all('li', class_='tag-type-character')
artistlis = tagsection.find_all('li', class_='tag-type-artist')
taglis = tagsection.find_all('li', class_='tag-type-general')
tags = []
for tagli in taglis:
tag = tagli.find('a').get_text()
tags.append(tag)
artists = []
for artistli in artistlis:
artist = artistli.find('a').get_text()
artists.append(artist)
characters = []
for characterli in characterlis:
character = characterli.find('a').get_text()
characters.append(character)
for tag in tags:
if tag not in job.tags:
job.tags.append(tag)
for artist in artists:
if artist not in job.artist:
job.artist.append(artist)
for character in characters:
if character not in job.character:
job.character.append(character)
def getxy(self, instr):
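        # Extract image dimensions from a '(WIDTHxHEIGHT)' string, e.g. '(1280x720)'.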
found = re.search(r"\((\d+)x(\d+)\)", instr)
x, y = found.groups()
return x, y
def extractInfo(self, job, infosection):
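        # Walk the stats sidebar's 'Name: value' items, copying score, post date and
        # dimensions onto the job; returns the full-size image URL from the 'Size' entry
        # when one is found.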
imgurl = None
for li in infosection.find_all("li"):
rawt = li.get_text()
name, val = rawt.split(":", 1)
name = name.strip()
val = val.strip()
if name == 'Rating':
pass
# job.rating = val
elif name == 'Favorites':
pass
# job.favorites = val
elif name == 'Score':
val = val.strip()
val = val.split()[0]
job.score = val
elif name == 'Posted':
cal = parsedatetime.Calendar()
val = val.split("by")[0]
tstruct, pstat = cal.parse(val)
assert pstat == 1 or pstat == 2
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
elif name == 'Size':
if not '\n' in val:
return False
fsize, res = val.split("\n")
fsize, res = fsize.strip(), res.strip()
job.imgx, job.imgy = self.getxy(res)
link = li.find("a")
if link:
imgurl = link['href']
elif name == 'Status':
pass
# job.status = val
# # Do not try to fetch things that are banned (e.g. removed)
# if val == 'Banned':
# job.dlstate=-2
elif name in ['Approver', 'Id', 'Source', 'Uploader']:
pass
else:
self.log.warning("Unknown item key-value:")
self.log.warning(" '{}' -> '{}'".format(name, val))
return imgurl
def extractMeta(self, job, soup):
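        # The tag sidebar and stats box must both be present; a missing section trips the
        # assertions, which processJob() turns into dlstate=-50.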
tagsection = soup.find('ul', id='tag-sidebar')
assert tagsection
infosection = soup.find('div', id='stats')
assert infosection
self.extractTags(job, tagsection)
imgurl = self.extractInfo(job, infosection)
return imgurl
def fetchImage(self, job, url, srcurl):
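        # Image links can be relative, so resolve against the post page URL and pass that
        # page as the Referer; dlstate=2 marks the file as successfully saved.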
url = urllib.parse.urljoin(srcurl, url)
fname = url.split("/")[-1]
cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
fpath = self.saveFile(job, fname, cont)
self.log.info("Saved file to path: '%s'", fpath)
job.filename = fname
job.filepath = fpath
job.dlstate = 2
db.session.commit()
# print(fname)
def processJob(self, job):
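        # Fetch the post page, flag posts that can never be downloaded with a negative
        # dlstate (network error, gold-only, deleted, flash-only), then parse the metadata
        # and download the image, retrying up to five times on IntegrityError.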
pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
try:
soup = self.wg.getSoup(pageurl)
except urllib.error.URLError:
job.dlstate=-1
db.session.commit()
return
text = soup.get_text()
if 'You need a gold account to see this image.' in text:
job.dlstate=-3
db.session.commit()
return
if 'This post was deleted for the following reasons' in text:
job.dlstate=-4
db.session.commit()
return
if 'Save this flash' in text:
job.dlstate=-9
db.session.commit()
return
err = 0
while err < 5:
try:
imgurl = self.extractMeta(job, soup)
if imgurl:
self.fetchImage(job, imgurl, pageurl)
else:
self.log.info("No image found for URL: '%s'", pageurl)
job.dlstate=-5
break
except AssertionError:
self.log.info("Assertion error?: '%s'", pageurl)
job.dlstate=-50
db.session.rollback()
break
except sqlalchemy.exc.IntegrityError:
err += 1
db.session.rollback()
def retreiveItem(self):
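        # Claim one queued post via the inherited get_job() and process it; returns False
        # once no work is left.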
job = self.get_job()
if not job:
return False
self.processJob(job)
return True
def run(indice):
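    # Thread entry point used by main.py: keep processing posts until the queue is empty
    # or runstate.run is cleared.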
print("Runner {}!".format(indice))
fetcher = R34xxxFetcher()
remainingTasks = True
try:
while remainingTasks and runstate.run:
remainingTasks = fetcher.retreiveItem()
except KeyboardInterrupt:
return
except:
print("Unhandled exception!")
traceback.print_exc()
raise