Now also fully scrapes gelbooru.

Fake-Name 2015-07-11 23:07:48 -07:00
parent 47bc610ffd
commit 44b5e4933f
4 changed files with 100 additions and 47 deletions

.gitignore

@@ -1,3 +1,4 @@
/__pycache__/*.pyc
/*.lwp
/logs
/*.pyc

database.py

@@ -33,8 +33,9 @@ from settings import DATABASE_PASS as C_DATABASE_PASS
SQLALCHEMY_DATABASE_URI = 'postgresql://{user}:{passwd}@{host}:5432/{database}'.format(user=C_DATABASE_USER, passwd=C_DATABASE_PASS, host=C_DATABASE_IP, database=C_DATABASE_DB_NAME)
# I was having issues with timeouts because the default connection pool is 5 connections.
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_size = 20)
engine = create_engine(SQLALCHEMY_DATABASE_URI)
SessionFactory = sessionmaker(bind=engine)
session = scoped_session(SessionFactory)
# session = Session()
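For context on the pool change: with one shared engine and a few dozen worker threads, SQLAlchemy's default pool of 5 connections runs dry and callers time out waiting for a checkout. A minimal sketch of sizing the pool, with illustrative numbers and placeholder credentials rather than this project's settings:

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

DB_URI = 'postgresql://user:password@localhost:5432/booru'   # placeholder, not the real settings

# pool_size sets the number of persistent connections kept open;
# max_overflow allows short-lived extras before callers block on checkout.
engine = create_engine(DB_URI, pool_size=20, max_overflow=10)

SessionFactory = sessionmaker(bind=engine)
session = scoped_session(SessionFactory)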
@@ -197,7 +198,7 @@ class Releases(Base):
    file = association_proxy('file_rel', 'files', creator=file_creator)
    __table_args__ = (
        UniqueConstraint('postid'),
        UniqueConstraint('postid', 'source'),
    )
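The widened constraint is what lets Danbooru and Gelbooru rows share the same post id space. A minimal sketch of a model with that composite constraint; column names other than postid, source and dlstate are illustrative, not this repository's actual schema:

from sqlalchemy import Column, Integer, Text, UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Releases(Base):
    __tablename__ = 'releases'
    id      = Column(Integer, primary_key=True)
    postid  = Column(Integer, nullable=False)
    source  = Column(Text, nullable=False)
    dlstate = Column(Integer, default=0)

    __table_args__ = (
        # One row per post *per site*, so the two boorus' ids can overlap.
        UniqueConstraint('postid', 'source'),
    )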

gelbooruFetch.py

@@ -15,9 +15,9 @@ import os.path
import time
import datetime
class DanbooruFetcher(object):
class GelbooruFetcher(object):
    def __init__(self):
        self.log = logging.getLogger("Main.Danbooru")
        self.log = logging.getLogger("Main.Gelbooru")
        self.wg = webFunctions.WebGetRobust()
        # db.session = db.Session()
@@ -44,26 +44,27 @@ class DanbooruFetcher(object):
    def extractTags(self, job, tagsection):
        characterlis = tagsection.find_all('li', class_='category-4')
        artistlis = tagsection.find_all('li', class_='category-1')
        taglis = tagsection.find_all('li', class_='category-0')
        taglis = tagsection.find_all('li', class_='tag-type-general')
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        tags = []
        for tagli in taglis:
            tag = tagli.find('a', class_="search-tag").get_text()
            tag = tagli.find_all('a')[1].get_text()
            tags.append(tag)
        artists = []
        for artistli in artistlis:
            artist = artistli.find('a', class_="search-tag").get_text()
            artist = artistli.find_all('a')[1].get_text()
            artists.append(artist)
        characters = []
        for characterli in characterlis:
            character = characterli.find('a', class_="search-tag").get_text()
            character = characterli.find_all('a')[1].get_text()
            characters.append(character)
        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
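For reference, the same sidebar-scraping idea as a standalone snippet. The HTML here is a simplified stand-in for Gelbooru's tag sidebar; the second <a> in each list item is taken because the first is the "?" wiki link:

import bs4

html = """
<ul id="tag-sidebar">
  <li class="tag-type-artist"><a href="#">?</a> <a href="#">some_artist</a></li>
  <li class="tag-type-character"><a href="#">?</a> <a href="#">some_character</a></li>
  <li class="tag-type-general"><a href="#">?</a> <a href="#">long_hair</a></li>
</ul>
"""

soup = bs4.BeautifulSoup(html, 'html.parser')
sidebar = soup.find('ul', id='tag-sidebar')

def names(lis):
    # Skip the leading "?" link and keep the tag text itself.
    return [li.find_all('a')[1].get_text() for li in lis]

tags       = names(sidebar.find_all('li', class_='tag-type-general'))
artists    = names(sidebar.find_all('li', class_='tag-type-artist'))
characters = names(sidebar.find_all('li', class_='tag-type-character'))

print(tags, artists, characters)   # ['long_hair'] ['some_artist'] ['some_character']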
@@ -75,14 +76,19 @@ class DanbooruFetcher(object):
                job.character.append(character)
    def getxy(self, instr):
        found = re.search(r"\((\d+)x(\d+)\)", instr)
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return x, y
    def extractInfo(self, job, infosection):
        imgurl = None
        for li in infosection.find_all("li"):
            rawt = li.get_text()
            rawt = li.get_text().strip()
            if not rawt:
                continue
            if not ":" in rawt:
                print("rawt: '{}'".format(rawt))
            name, val = rawt.split(":", 1)
            name = name.strip()
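A quick worked example of the loosened resolution regex: the Danbooru pattern required the parenthesised "(1280x960)" form, while the new pattern matches a bare WIDTHxHEIGHT pair anywhere in the string. The sample values are made up:

import re

def getxy(instr):
    found = re.search(r"(\d+)x(\d+)", instr)
    return found.groups()

print(getxy("1295374 (1280x960)"))   # ('1280', '960')
print(getxy("5000x3000"))            # ('5000', '3000')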
@@ -93,41 +99,43 @@ class DanbooruFetcher(object):
            elif name == 'Favorites':
                job.favorites = val
            elif name == 'Score':
                job.score = val
            elif name == 'Date':
                job.score = val.split()[0]
            elif name == 'Posted':
                cal = parsedatetime.Calendar()
                tstruct, pstat = cal.parse(val)
                itemdate = val.split("at")[0]
                tstruct, pstat = cal.parse(itemdate)
                assert pstat == 1 or pstat == 2
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                if not '\n' in val:
                    return False
                fsize, res = val.split("\n")
                fsize, res = fsize.strip(), res.strip()
                job.imgx, job.imgy = self.getxy(res)
                job.imgx, job.imgy = self.getxy(val)
                link = li.find("a")
                if link:
                    imgurl = link['href']
            elif name == 'Status':
                job.status = val
                # Do not try to fetch things that are banned (e.g. removed)
                if val == 'Banned':
                    job.dlstate=-2
            elif name in ['Approver', 'ID', 'Source', 'Uploader']:
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning(" '{}' -> '{}'".format(name, val))
        return imgurl
    def getImageUrl(self, soup):
        img = soup.find('a', text='Original image')
        return img['href']
    def extractMeta(self, job, soup):
        tagsection = soup.find('section', id='tag-list')
        sidebar = soup.find('div', class_='sidebar4').find('div', class_='sidebar3')
        tagsection = sidebar.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = soup.find('section', id='post-information')
        infosection = sidebar.find('div', id='stats')
        assert infosection
        self.extractTags(job, tagsection)
        imgurl = self.extractInfo(job, infosection)
        self.extractInfo(job, infosection)
        imgurl = self.getImageUrl(soup)
        return imgurl
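For reference, a standalone sketch of the date handling used for the Posted field. The sample string is only an assumed shape of Gelbooru's sidebar value; the split("at") drops the time-of-day suffix so parsedatetime only has to handle the date part:

import time
import datetime
import parsedatetime

val = "2015-07-11 at 22:45"          # assumed format of the 'Posted:' value

itemdate = val.split("at")[0]        # '2015-07-11 '
cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse(itemdate)

# parse() returns a time.struct_time plus a status flag (0 = failed, 1 = date, 2 = time).
assert pstat == 1 or pstat == 2
posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
print(posted.date())                 # 2015-07-11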
@@ -191,23 +199,32 @@ class DanbooruFetcher(object):
        # print(fname)
    def processJob(self, job):
        pageurl = 'https://danbooru.donmai.us/posts/{}'.format(job.postid)
        try:
            soup = self.wg.getSoup(pageurl)
        except urllib.error.URLError:
            job.dlstate=-1
            db.session.commit()
            return
        pageurl = 'http://gelbooru.com/index.php?page=post&s=view&id={}'.format(job.postid)
        while 1:
            try:
                soup = self.wg.getSoup(pageurl)
                if 'You are viewing an advertisement' in soup.get_text():
                    self.log.warning("Working around advertisement. Sleeping 10 seconds")
                    time.sleep(10)
                else:
                    break
            except urllib.error.URLError:
                job.dlstate=-1
                db.session.commit()
                return
        text = soup.get_text()
        if 'You need a gold account to see this image.' in text:
            job.dlstate=-3
            db.session.commit()
            return
        if 'This post was deleted for the following reasons' in text:
        if 'Gelbooru - Image List' in soup.title.get_text():
            self.log.warn("Image has been removed.")
            job.dlstate=-4
            db.session.commit()
            return
        # text = soup.get_text()
        # if 'You need a gold account to see this image.' in text:
        #     job.dlstate=-3
        #     db.session.commit()
        #     return
        err = 0
        while err < 5:
            try:
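The advertisement workaround above amounts to "refetch until the real post page comes back". A generic standalone version of that loop, using urllib and BeautifulSoup as stand-ins for the project's WebGetRobust helper, and a bounded retry count instead of while 1:

import time
import urllib.request
import urllib.error
import bs4

def get_post_soup(postid, max_tries=10, wait=10):
    pageurl = 'http://gelbooru.com/index.php?page=post&s=view&id={}'.format(postid)
    for _ in range(max_tries):
        try:
            with urllib.request.urlopen(pageurl) as resp:
                soup = bs4.BeautifulSoup(resp.read(), 'html.parser')
        except urllib.error.URLError:
            return None                                  # caller flags the job as failed
        if 'You are viewing an advertisement' in soup.get_text():
            time.sleep(wait)                             # interstitial ad page, wait and refetch
            continue
        return soup
    return None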
@@ -238,7 +255,7 @@ class DanbooruFetcher(object):
def run(indice):
    print("Runner {}!".format(indice))
    fetcher = DanbooruFetcher()
    fetcher = GelbooruFetcher()
    remainingTasks = True
    try:

main.py

@@ -1,19 +1,22 @@
import sys
import database as db
import logSetup
logSetup.initLogging()
import danbooruFetch
import gelbooruFetch
import runstate
import concurrent.futures
THREADS = 1
THREADS = 25
THREADS = 30
def insertDanbooruStartingPoints():
    tmp = db.session.query(db.Releases) \
        .filter(db.Releases.postid == 1) \
    tmp = db.session.query(db.Releases) \
        .filter(db.Releases.postid == 1) \
        .filter(db.Releases.source == 'Danbooru') \
        .count()
    if not tmp:
        for x in range(2070000):
@@ -25,6 +28,31 @@ def insertDanbooruStartingPoints():
                print("Flushed.")
    db.session.commit()
def insertGelbooruStartingPoints():
    tmp = db.session.query(db.Releases) \
        .filter(db.Releases.postid == 1) \
        .filter(db.Releases.source == 'Gelbooru') \
        .count()
    if not tmp:
        print("Building insert data structure")
        dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
        print("Building insert query")
        q = db.Releases.__table__.insert().values(dat)
        print("Built. Doing insert.")
        db.engine.execute(q)
        print("Done.")
        # for x in :
        #     new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
        #     # db.session.add(new)
        #     if x % 100000 == 0:
        #         print("Loop ", x, "flushing...")
        #         db.session.flush()
        #         print("Flushed.")
    db.session.commit()
def resetDlstate():
    tmp = db.session.query(db.Releases) \
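The seeding function above builds one Core INSERT covering every placeholder row instead of adding ~2.9 million ORM objects through the session. A rough standalone equivalent of that technique, written against the 1.x-style SQLAlchemy API the rest of the code uses, with an in-memory SQLite engine and a much smaller row count so the sketch runs anywhere:

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Text

meta = MetaData()
releases = Table('releases', meta,
    Column('id', Integer, primary_key=True),
    Column('postid', Integer),
    Column('source', Text),
    Column('dlstate', Integer),
)

engine = create_engine('sqlite://')          # stand-in for the real Postgres engine
meta.create_all(engine)

# One dict per row; a single multi-VALUES insert is far cheaper than session.add() per row.
dat = [{"dlstate": 0, "postid": x, "source": 'Gelbooru'} for x in range(200)]   # 2,900,000 in the real run
engine.execute(releases.insert().values(dat))

print(engine.execute("SELECT COUNT(*) FROM releases").scalar())   # 200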
@@ -35,13 +63,19 @@ def resetDlstate():
def go():
    insertDanbooruStartingPoints()
    insertGelbooruStartingPoints()
    resetDlstate()
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
    try:
        # for x in range(2):
        for x in range(THREADS):
        for x in range(THREADS//2):
            executor.submit(danbooruFetch.run, x)
        for x in range(THREADS//2):
            executor.submit(gelbooruFetch.run, x)
        executor.shutdown()
    except KeyboardInterrupt:
        print("Waiting for executor.")