Fake-Name 2017-11-22 23:27:17 -08:00
parent 0570d20d7e
commit 6b5daa3c4a
7 changed files with 517 additions and 20 deletions

scraper/modules/KonaChanFetch.py

@@ -54,11 +54,6 @@ class KonaChanFetcher(scraper.fetchBase.AbstractFetcher):
             character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
-        # print("Meta:")
-        # print("tags:", tags)
-        # print("artists:", artists)
-        # print("characters:", characters)
         for tag in tags:
             if tag not in job.tags:
                 job.tags.append(tag)

scraper/modules/danbooruFetch.py

@@ -27,9 +27,13 @@ class DanbooruFetcher(scraper.fetchBase.AbstractFetcher):
         characterlis = tagsection.find_all('li', class_='category-4')
         artistlis = tagsection.find_all('li', class_='category-1')
         taglis = tagsection.find_all('li', class_='category-0')
+        copyrlis = tagsection.find_all('li', class_='category-3')
         tags = []
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         for tagli in taglis:
             tag = tagli.find('a', class_="search-tag").get_text()
             tags.append(tag)
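The category-N classes mirror Danbooru's public tag category IDs, which is worth knowing when reading these selectors. A small reference mapping (the IDs are Danbooru convention, not something defined in this commit):

DANBOORU_TAG_CATEGORIES = {
    0: "general",    # class_='category-0'
    1: "artist",     # class_='category-1'
    3: "copyright",  # class_='category-3'
    4: "character",  # class_='category-4'
}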

scraper/modules/gelbooruFetch.py

@@ -24,30 +24,38 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
         # db.session = db.Session()

     def extractTags(self, job, tagsection):
-        taglis = tagsection.find_all('li', class_='tag-type-general')
         characterlis = tagsection.find_all('li', class_='tag-type-character')
+        specieslis = tagsection.find_all('li', class_='tag-type-species')
+        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
         artistlis = tagsection.find_all('li', class_='tag-type-artist')
+        taglis = tagsection.find_all('li', class_='tag-type-general')
         tags = []
         for tagli in taglis:
-            tag = tagli.find_all('a')[1].get_text()
+            tag = tagli.find_all('a')[-1].get_text()
             tags.append(tag)
+        for speciesli in specieslis:
+            tag = speciesli.find_all('a')[-1].get_text()
+            tags.append("species " + tag)
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         artists = []
         for artistli in artistlis:
-            artist = artistli.find_all('a')[1].get_text()
+            artist = artistli.find_all('a')[-1].get_text()
             artists.append(artist)
         characters = []
         for characterli in characterlis:
-            character = characterli.find_all('a')[1].get_text()
+            character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
         for tag in tags:
             if tag not in job.tags:
                 job.tags.append(tag)
@@ -113,11 +121,9 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
         return img['href']

     def extractMeta(self, job, soup):
-        sidebar = soup.find('div', class_='sidebar4').find('div', class_='sidebar3')
-        tagsection = sidebar.find('ul', id='tag-sidebar')
+        tagsection = soup.find('ul', id='tag-sidebar')
         assert tagsection
-        infosection = sidebar.find('div', id='stats')
+        infosection = soup.find('div', id='stats')
         assert infosection

         self.extractTags(job, tagsection)
         self.extractInfo(job, infosection)
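On these Gelbooru-style boards each sidebar tag entry is an li whose last anchor holds the tag name (earlier anchors are typically the "?" wiki link), which is why the changes above and below switch from find_all('a')[1] or find('a') to find_all('a')[-1]. A minimal sketch of the parse these selectors assume (the HTML is illustrative, not taken from the commit):

from bs4 import BeautifulSoup

html = """
<ul id="tag-sidebar">
  <li class="tag-type-artist"><a href="#">?</a> <a href="#">some_artist</a> 42</li>
  <li class="tag-type-general"><a href="#">?</a> <a href="#">landscape</a> 9001</li>
</ul>
"""
tagsection = BeautifulSoup(html, "html.parser").find('ul', id='tag-sidebar')
for li in tagsection.find_all('li', class_='tag-type-general'):
    print(li.find_all('a')[-1].get_text())  # -> landscape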

scraper/modules/r34xxxScrape.py

@@ -25,23 +25,33 @@ class R34xxxFetcher(scraper.fetchBase.AbstractFetcher):
     def extractTags(self, job, tagsection):
         characterlis = tagsection.find_all('li', class_='tag-type-character')
+        specieslis = tagsection.find_all('li', class_='tag-type-species')
+        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
         artistlis = tagsection.find_all('li', class_='tag-type-artist')
         taglis = tagsection.find_all('li', class_='tag-type-general')
         tags = []
         for tagli in taglis:
-            tag = tagli.find('a').get_text()
+            tag = tagli.find_all('a')[-1].get_text()
             tags.append(tag)
+        for speciesli in specieslis:
+            tag = speciesli.find_all('a')[-1].get_text()
+            tags.append("species " + tag)
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         artists = []
         for artistli in artistlis:
-            artist = artistli.find('a').get_text()
+            artist = artistli.find_all('a')[-1].get_text()
             artists.append(artist)
         characters = []
         for characterli in characterlis:
-            character = characterli.find('a').get_text()
+            character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
         for tag in tags:

scraper/modules/tbibFetch.py

@@ -0,0 +1,240 @@
import traceback
import urllib.error
import urllib.parse
import re
import time
import datetime

import sqlalchemy.exc
import parsedatetime

import scraper.runstate
import scraper.database as db
import scraper.fetchBase


class TbibFetcher(scraper.fetchBase.AbstractFetcher):
    pluginkey = 'TBIB'
    loggerpath = "Main.TBIB"

    content_count_max = 6360000

    def __init__(self):
        super().__init__()
        # db.session = db.Session()

    def extractTags(self, job, tagsection):
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        specieslis = tagsection.find_all('li', class_='tag-type-species')
        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        taglis = tagsection.find_all('li', class_='tag-type-general')

        tags = []
        for tagli in taglis:
            tag = tagli.find_all('a')[-1].get_text()
            tags.append(tag)
        for speciesli in specieslis:
            tag = speciesli.find_all('a')[-1].get_text()
            tags.append("species " + tag)
        for copyrli in copyrlis:
            tag = copyrli.find_all('a')[-1].get_text()
            tags.append("copyright " + tag)

        artists = []
        for artistli in artistlis:
            artist = artistli.find_all('a')[-1].get_text()
            artists.append(artist)

        characters = []
        for characterli in characterlis:
            character = characterli.find_all('a')[-1].get_text()
            characters.append(character)

        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
        for artist in artists:
            if artist not in job.artist:
                job.artist.append(artist)
        for character in characters:
            if character not in job.character:
                job.character.append(character)

    def getxy(self, instr):
        # Extract the "WxH" dimensions from the stats "Size" line.
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return x, y

    def extractInfo(self, job, infosection):
        for li in infosection.find_all("li"):
            rawt = li.get_text().strip()
            if not rawt:
                continue
            if ":" not in rawt:
                print("rawt: '{}'".format(rawt))
                # Skip lines with no key/value separator; the unpack
                # below would otherwise raise ValueError.
                continue
            name, val = rawt.split(":", 1)
            name = name.strip()
            val = val.strip()
            if name == 'Rating':
                job.rating = val
            elif name == 'Favorites':
                job.favorites = val
            elif name == 'Score':
                job.score = val.split()[0]
            elif name == 'Posted':
                # Trim the "at <time>" / "by <uploader>" tails, then let
                # parsedatetime handle the remainder.
                cal = parsedatetime.Calendar()
                itemdate = val.split("at")[0]
                itemdate = itemdate.split("by")[0]
                print("itemdate", itemdate)
                tstruct, pstat = cal.parse(itemdate)
                print("Ret: ", pstat, tstruct)
                assert pstat in (1, 2, 3)
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                job.imgx, job.imgy = self.getxy(val)
            elif name == 'Status':
                job.status = val
                # Do not try to fetch things that are banned (e.g. removed)
                if val == 'Banned':
                    job.state = 'removed'
                    job.err_str = 'item banned'
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning("    '{}' -> '{}'".format(name, val))

    def getImageUrl(self, soup):
        img = soup.find('a', text='Original image')
        return img['href']

    def extractMeta(self, job, soup):
        sidebar = soup.find('div', class_='sidebar')
        tagsection = sidebar.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = sidebar.find('div', id='stats')
        assert infosection
        self.extractTags(job, tagsection)
        self.extractInfo(job, infosection)
        imgurl = self.getImageUrl(soup)
        return imgurl

    def fetchImage(self, job, url, srcurl):
        url = urllib.parse.urljoin(srcurl, url)
        fname = url.split("/")[-1]
        cont = self.wg.getpage(url, addlHeaders={'Referer': srcurl})
        fpath = self.saveFileRow(job, fname, cont)
        self.log.info("Saved file to path: '%s'", fpath)

        job.filename = fname
        job.filepath = fpath
        job.state = 'complete'
        db.session.commit()
        # print(fname)

    def processJob(self, job):
        pageurl = 'http://tbib.org/index.php?page=post&s=view&id={}'.format(job.postid)
        while 1:
            try:
                soup = self.wg.getSoup(pageurl)
                if 'You are viewing an advertisement' in soup.get_text():
                    self.log.warning("Working around advertisement. Sleeping 13 seconds")
                    time.sleep(13)
                else:
                    break
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching container page'
                db.session.commit()
                return

        if 'Gelbooru - Image List' in soup.title.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed'
            db.session.commit()
            return

        if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed because it was a duplicate'
            db.session.commit()
            return

        err = 0
        while err < 5:
            try:
                imgurl = self.extractMeta(job, soup)
                if imgurl:
                    self.fetchImage(job, imgurl, pageurl)
                else:
                    self.log.info("No image found for URL: '%s'", pageurl)
                    job.state = 'error'
                    job.err_str = 'failed to find image!'
                break
            except sqlalchemy.exc.IntegrityError:
                # Transient DB conflict; roll back and retry (up to 5 times).
                err += 1
                db.session.rollback()
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching actual image'
                # Give up rather than retrying forever; err is not
                # incremented on this path.
                break
        db.session.commit()

    def retreiveItem(self):
        job = self.get_job()
        if not job:
            return False
        self.processJob(job)
        return True


def run(indice):
    print("Runner {}!".format(indice))
    # The committed code instantiated GelbooruFetcher() here, which looks
    # like a copy-paste slip; this module's own fetcher is the one intended.
    fetcher = TbibFetcher()
    remainingTasks = True
    try:
        while remainingTasks and scraper.runstate.run:
            remainingTasks = fetcher.retreiveItem()
    except KeyboardInterrupt:
        return
    except:
        print("Unhandled exception!")
        traceback.print_exc()
        raise


if __name__ == '__main__':
    import logSetup
    logSetup.initLogging()
    run(1)
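The 'Posted' handling above leans on parsedatetime's two-value return; a minimal sketch of that contract (the input string is hypothetical):

import time
import datetime
import parsedatetime

cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse("2017-11-20 10:33")  # hypothetical "Posted" value
# pstat: 0 = parse failed, 1 = date only, 2 = time only, 3 = date and time
if pstat in (1, 2, 3):
    print(datetime.datetime.fromtimestamp(time.mktime(tstruct)))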

scraper/modules/xbooruFetch.py

@@ -0,0 +1,238 @@
import traceback
import urllib.error
import urllib.parse
import re
import time
import datetime

import sqlalchemy.exc
import parsedatetime

import scraper.runstate
import scraper.database as db
import scraper.fetchBase


class XBooruFetcher(scraper.fetchBase.AbstractFetcher):
    pluginkey = 'XBooru'
    loggerpath = "Main.XBooru"

    content_count_max = 710000

    def __init__(self):
        super().__init__()
        # db.session = db.Session()

    def extractTags(self, job, tagsection):
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        specieslis = tagsection.find_all('li', class_='tag-type-species')
        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        taglis = tagsection.find_all('li', class_='tag-type-general')

        tags = []
        for tagli in taglis:
            tag = tagli.find_all('a')[-1].get_text()
            tags.append(tag)
        for speciesli in specieslis:
            tag = speciesli.find_all('a')[-1].get_text()
            tags.append("species " + tag)
        for copyrli in copyrlis:
            tag = copyrli.find_all('a')[-1].get_text()
            tags.append("copyright " + tag)

        artists = []
        for artistli in artistlis:
            artist = artistli.find_all('a')[-1].get_text()
            artists.append(artist)

        characters = []
        for characterli in characterlis:
            character = characterli.find_all('a')[-1].get_text()
            characters.append(character)

        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
        for artist in artists:
            if artist not in job.artist:
                job.artist.append(artist)
        for character in characters:
            if character not in job.character:
                job.character.append(character)

    def getxy(self, instr):
        # Extract the "WxH" dimensions from the stats "Size" line.
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return x, y

    def extractInfo(self, job, infosection):
        for li in infosection.find_all("li"):
            rawt = li.get_text().strip()
            if not rawt:
                continue
            if ":" not in rawt:
                print("rawt: '{}'".format(rawt))
                # Skip lines with no key/value separator; the unpack
                # below would otherwise raise ValueError.
                continue
            name, val = rawt.split(":", 1)
            name = name.strip()
            val = val.strip()
            if name == 'Rating':
                job.rating = val
            elif name == 'Favorites':
                job.favorites = val
            elif name == 'Score':
                job.score = val.split()[0]
            elif name == 'Posted':
                # Trim the "at <time>" / "by <uploader>" tails, then let
                # parsedatetime handle the remainder.
                cal = parsedatetime.Calendar()
                itemdate = val.split("at")[0]
                itemdate = itemdate.split("by")[0]
                print("itemdate", itemdate)
                tstruct, pstat = cal.parse(itemdate)
                print("Ret: ", pstat, tstruct)
                assert pstat in (1, 2, 3)
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                job.imgx, job.imgy = self.getxy(val)
            elif name == 'Status':
                job.status = val
                # Do not try to fetch things that are banned (e.g. removed)
                if val == 'Banned':
                    job.state = 'removed'
                    job.err_str = 'item banned'
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning("    '{}' -> '{}'".format(name, val))

    def getImageUrl(self, soup):
        img = soup.find('a', text='Original image')
        return img['href']

    def extractMeta(self, job, soup):
        tagsection = soup.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = soup.find('div', id='stats')
        assert infosection
        self.extractTags(job, tagsection)
        self.extractInfo(job, infosection)
        imgurl = self.getImageUrl(soup)
        return imgurl

    def fetchImage(self, job, url, srcurl):
        url = urllib.parse.urljoin(srcurl, url)
        fname = url.split("/")[-1]
        cont = self.wg.getpage(url, addlHeaders={'Referer': srcurl})
        fpath = self.saveFileRow(job, fname, cont)
        self.log.info("Saved file to path: '%s'", fpath)

        job.filename = fname
        job.filepath = fpath
        job.state = 'complete'
        db.session.commit()
        # print(fname)

    def processJob(self, job):
        # NOTE: the committed code reused the tbib.org URL here; the
        # xbooru.com endpoint below is the assumed intended target for
        # this module.
        pageurl = 'http://xbooru.com/index.php?page=post&s=view&id={}'.format(job.postid)
        while 1:
            try:
                soup = self.wg.getSoup(pageurl)
                if 'You are viewing an advertisement' in soup.get_text():
                    self.log.warning("Working around advertisement. Sleeping 13 seconds")
                    time.sleep(13)
                else:
                    break
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching container page'
                db.session.commit()
                return

        if 'Gelbooru - Image List' in soup.title.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed'
            db.session.commit()
            return

        if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed because it was a duplicate'
            db.session.commit()
            return

        err = 0
        while err < 5:
            try:
                imgurl = self.extractMeta(job, soup)
                if imgurl:
                    self.fetchImage(job, imgurl, pageurl)
                else:
                    self.log.info("No image found for URL: '%s'", pageurl)
                    job.state = 'error'
                    job.err_str = 'failed to find image!'
                break
            except sqlalchemy.exc.IntegrityError:
                # Transient DB conflict; roll back and retry (up to 5 times).
                err += 1
                db.session.rollback()
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching actual image'
                # Give up rather than retrying forever; err is not
                # incremented on this path.
                break
        db.session.commit()

    def retreiveItem(self):
        job = self.get_job()
        if not job:
            return False
        self.processJob(job)
        return True


def run(indice):
    print("Runner {}!".format(indice))
    # The committed code instantiated GelbooruFetcher() here, which looks
    # like a copy-paste slip; this module's own fetcher is the one intended.
    fetcher = XBooruFetcher()
    remainingTasks = True
    try:
        while remainingTasks and scraper.runstate.run:
            remainingTasks = fetcher.retreiveItem()
    except KeyboardInterrupt:
        return
    except:
        print("Unhandled exception!")
        traceback.print_exc()
        raise


if __name__ == '__main__':
    import logSetup
    logSetup.initLogging()
    run(1)
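Both new fetchers reduce the stats "Size" line to image dimensions with the same regex. A standalone illustration of what getxy yields (the input string is made up; note the groups come back as strings, not ints):

import re

def getxy(instr):
    found = re.search(r"(\d+)x(\d+)", instr)
    return found.groups()

print(getxy("Size: 1600x1200"))  # -> ('1600', '1200')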


@@ -11,6 +11,8 @@ import scraper.modules.gelbooruFetch
 import scraper.modules.r34xxxScrape
 import scraper.modules.KonaChanFetch
 import scraper.modules.e621Scrape
+import scraper.modules.tbibFetch
+import scraper.modules.xbooruFetch

 # THREADS = 6
@@ -19,10 +21,12 @@ THREADS = 15

 PLUGIN_CLASSES = [
     scraper.modules.danbooruFetch.DanbooruFetcher,
-    scraper.modules.gelbooruFetch.GelbooruFetcher,
+    # scraper.modules.gelbooruFetch.GelbooruFetcher,
     scraper.modules.r34xxxScrape.R34xxxFetcher,
     scraper.modules.KonaChanFetch.KonaChanFetcher,
     scraper.modules.e621Scrape.E621Fetcher,
+    scraper.modules.tbibFetch.TbibFetcher,
+    scraper.modules.xbooruFetch.XBooruFetcher,
 ]

 class RunEngine(object):
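RunEngine itself is outside this diff, so the following is only a sketch of how a plugin table like PLUGIN_CLASSES is typically driven; every name below except PLUGIN_CLASSES and retreiveItem is illustrative:

import threading

def worker(fetcher_cls):
    # Drain jobs until the fetcher reports nothing left to do.
    fetcher = fetcher_cls()
    while fetcher.retreiveItem():
        pass

# Hypothetical driver loop, not the real RunEngine:
threads = [threading.Thread(target=worker, args=(cls,)) for cls in PLUGIN_CLASSES]
for t in threads:
    t.start()
for t in threads:
    t.join()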