It now scrapinates. Whoo?

Fake-Name 2015-07-11 15:48:20 -07:00
parent d0a718886c
commit 2d67b291aa
4 changed files with 217 additions and 37 deletions

View File

@ -1,26 +1,9 @@
# import rpc
# import task_exceptions
# import deps.ExContentLoader
# import deps.ContentLoader
# import deps.LibraryContentEnqueue
# import deps.LibraryContentEnqueue
# import deps.ExExtract
# import deps.nameTools as nt
# import os.path
# import traceback
# import string
# import settings
# import time
# import pprint
# import traceback
# from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from sqlalchemy import Table
# from sqlalchemy import MetaData
from sqlalchemy import Column
from sqlalchemy import Integer
@ -36,8 +19,6 @@ from sqlalchemy.schema import UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.associationproxy import association_proxy
# from sqlalchemy.sql.expression import func
# from citext import CIText
# Patch in knowledge of the citext type, so it reflects properly.
from sqlalchemy.dialects.postgresql.base import ischema_names
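# The registration itself falls outside this hunk; it is presumably a
# single mapping so reflection can resolve the DB type (a sketch,
# assuming the citext package's CIText type):
# ischema_names['citext'] = citext.CIText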
@ -54,8 +35,8 @@ SQLALCHEMY_DATABASE_URI = 'postgresql://{user}:{passwd}@{host}:5432/{database}'.
engine = create_engine(SQLALCHEMY_DATABASE_URI)
SessionFactory = sessionmaker(bind=engine)
Session = scoped_session(SessionFactory)
session = Session()
session = scoped_session(SessionFactory)
# session = Session()
Base = declarative_base()
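# Many-to-many association tables linking releases to tags and characters.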
db_tags_link = Table(
@ -151,6 +132,7 @@ class Releases(Base):
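# dlstate tracks fetch progress. Values the fetcher assigns:
#    0 = unfetched, 1 = claimed by a worker, 2 = image downloaded
#   -1 = URL error, -2 = banned post, -3 = gold-account-only post,
#   -4 = deleted post, -5 = no image found on the page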
dlstate = Column(Integer, nullable=False, index=True)
postid = Column(Integer, nullable=False, index=True)
source = Column(citext.CIText, nullable=False, index=True)
fsize = Column(BigInteger)
score = Column(Float)
@ -167,6 +149,7 @@ class Releases(Base):
filepath = Column(Text)
status = Column(Text)
rating = Column(Text)
tags_rel = relationship('Tags', secondary=lambda: db_tags_link)
character_rel = relationship('Characters', secondary=lambda: db_chars_link)

View File

@ -5,18 +5,27 @@ import logging
import traceback
import sqlalchemy.exc
import runstate
import urllib.error
import urllib.parse
import re
import parsedatetime
import os
import settings
import os.path
import time
import datetime
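# Fetches Danbooru post pages, extracts tags and metadata into the
# Releases table, and downloads the associated image files.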
class DanbooruFetcher(object):
def __init__(self):
self.log = logging.getLogger("Main.Danbooru")
self.wg = webFunctions.WebGetRobust()
self.session = db.Session()
# db.session = db.Session()
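# Claim the next unfetched release (dlstate == 0, lowest postid first)
# and mark it in-flight. Concurrent workers can race on the same row,
# so database errors are logged and the query is simply retried.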
def get_job(self):
while 1:
try:
job = self.session.query(db.Releases) \
job = db.session.query(db.Releases) \
.filter(db.Releases.dlstate == 0) \
.order_by(db.Releases.postid) \
.limit(1) \
@ -24,21 +33,203 @@ class DanbooruFetcher(object):
if job is None:
return None
job.dlstate = 1
self.session.commit()
db.session.commit()
return job
except sqlalchemy.exc.DatabaseError:
self.log.warning("Error when getting job. Probably a concurrency issue.")
self.log.warning("Trying again.")
for line in traceback.format_exc().split("\n"):
self.log.warning(line)
self.session.rollback()
db.session.rollback()
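# Pull the tag, artist, and character names out of the post's tag-list
# sidebar (danbooru marks them with category-0/1/4 classes) and merge
# any new ones into the job row.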
def extractTags(self, job, tagsection):
characterlis = tagsection.find_all('li', class_='category-4')
artistlis = tagsection.find_all('li', class_='category-1')
taglis = tagsection.find_all('li', class_='category-0')
tags = []
for tagli in taglis:
tag = tagli.find('a', class_="search-tag").get_text()
tags.append(tag)
artists = []
for artistli in artistlis:
artist = artistli.find('a', class_="search-tag").get_text()
artists.append(artist)
characters = []
for characterli in characterlis:
character = characterli.find('a', class_="search-tag").get_text()
characters.append(character)
for tag in tags:
if tag not in job.tags:
job.tags.append(tag)
for artist in artists:
if artist not in job.artist:
job.artist.append(artist)
for character in characters:
if character not in job.character:
job.character.append(character)
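# Extract image dimensions from a string like "1.28 MB (1280x960)".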
def getxy(self, instr):
found = re.search(r"\((\d+)x(\d+)\)", instr)
x, y = found.groups()
return x, y
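# Walk the post-information sidebar's key: value items, copying rating,
# favorites, score, post date, size, and status onto the job. Returns
# the URL of the full-size image when the Size entry links to one.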
def extractInfo(self, job, infosection):
imgurl = None
for li in infosection.find_all("li"):
rawt = li.get_text()
name, val = rawt.split(":", 1)
name = name.strip()
val = val.strip()
if name == 'Rating':
job.rating = val
elif name == 'Favorites':
job.favorites = val
elif name == 'Score':
job.score = val
elif name == 'Date':
cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse(val)
assert pstat in (1, 2)
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
elif name == 'Size':
if '\n' not in val:
return False
fsize, res = val.split("\n")
fsize, res = fsize.strip(), res.strip()
job.imgx, job.imgy = self.getxy(res)
link = li.find("a")
if link:
imgurl = link['href']
elif name == 'Status':
job.status = val
# Do not try to fetch things that are banned (e.g. removed)
if val == 'Banned':
job.dlstate = -2
elif name in ['Approver', 'ID', 'Source', 'Uploader']:
pass
else:
self.log.warning("Unknown item key-value:")
self.log.warning(" '{}' -> '{}'".format(name, val))
return imgurl
def extractMeta(self, job, soup):
tagsection = soup.find('section', id='tag-list')
assert tagsection
infosection = soup.find('section', id='post-information')
assert infosection
self.extractTags(job, tagsection)
imgurl = self.extractInfo(job, infosection)
return imgurl
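# Persist a downloaded file under settings.storeDir, sharded into
# subdirectories by the first three hex characters of its (hash-derived)
# filename. Returns the path relative to the storage root.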
def saveFile(self, filename, fileCont):
if not os.path.exists(settings.storeDir):
self.log.warn("Cache directory for book items did not exist. Creating")
self.log.warn("Directory at path '%s'", settings.storeDir)
os.makedirs(settings.storeDir)
fHash, ext = os.path.splitext(filename)
ext = ext.lower()
fHash = fHash.upper()
# Use the first 3 chars of the hash for the folder name.
# Since it's hex-encoded, that gives at most 2^12 = 4096 directories.
dirName = fHash[:3]
dirPath = os.path.join(settings.storeDir, dirName)
if not os.path.exists(dirPath):
os.makedirs(dirPath)
# The "." is part of the ext.
filename = '{filename}{ext}'.format(filename=fHash, ext=ext)
fqpath = os.path.join(dirPath, filename)
fqpath = os.path.abspath(fqpath)
if not fqpath.startswith(settings.storeDir):
raise ValueError("Generating the file path to save a fetched file produced a path outside the storage directory.")
locpath = fqpath[len(settings.storeDir):]
with open(fqpath, "wb") as fp:
fp.write(fileCont)
return locpath
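# Download the image (sending the post page as the Referer), save it,
# and mark the job complete (dlstate = 2).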
def fetchImage(self, job, url, srcurl):
url = urllib.parse.urljoin(srcurl, url)
fname = url.split("/")[-1]
cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
fpath = self.saveFile(fname, cont)
self.log.info("Saved file to path: '%s'", fpath)
job.filename = fname
job.filepath = fpath
job.dlstate = 2
db.session.commit()
# print(fname)
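# Process one claimed job: fetch the post page, short-circuit on fetch
# errors and banned/gold-only/deleted posts, then extract metadata and
# download the image, retrying up to 5 times on IntegrityError.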
def processJob(self, job):
pageurl = 'https://danbooru.donmai.us/posts/{}'.format(job.postid)
try:
soup = self.wg.getSoup(pageurl)
except urllib.error.URLError:
job.dlstate = -1
db.session.commit()
return
text = soup.get_text()
if 'You need a gold account to see this image.' in text:
job.dlstate = -3
db.session.commit()
return
if 'This post was deleted for the following reasons' in text:
job.dlstate = -4
db.session.commit()
return
err = 0
while err < 5:
try:
imgurl = self.extractMeta(job, soup)
if imgurl:
self.fetchImage(job, imgurl, pageurl)
else:
self.log.info("No image found for URL: '%s'", pageurl)
job.dlstate = -5
break
except sqlalchemy.exc.IntegrityError:
err += 1
db.session.rollback()
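# Claim one job and process it; returns False once the queue is empty.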
def retrieveItem(self):
job = self.get_job()
if not job:
return False
self.processJob(job)
return True
@ -48,11 +239,14 @@ class DanbooruFetcher(object):
def run(indice):
print("Runner {}!".format(indice))
fetcher = DanbooruFetcher()
job = True
remainingTasks = True
try:
while job and runstate.run:
job = fetcher.get_job()
print("Have job: ", job, job.postid)
while remainingTasks and runstate.run:
remainingTasks = fetcher.retrieveItem()
except KeyboardInterrupt:
return
except Exception:
print("Unhandled exception!")
traceback.print_exc()
raise

View File

@ -2,7 +2,7 @@
import logging
import colorama as clr
import threading
import os.path
import sys
import time
@ -35,9 +35,11 @@ class ColourHandler(logging.Handler):
# print record.name
segments = record.name.split(".")
tname = threading.current_thread().name
if segments[0] == "Main" and len(segments) > 1:
segments.pop(0)
segments[0] = "Main."+segments[0]
# segments.pop(0)
# segments[0] = "Main."+segments[0]
segments[0] = "Main."+tname
nameList = []

View File

@ -7,7 +7,8 @@ import fetcher
import runstate
import concurrent.futures
THREADS = 10
THREADS = 1
THREADS = 25
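# Seed the Releases table with one row per candidate post id, if the
# table is currently empty.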
def insertStartingPoints():
@ -16,7 +17,7 @@ def insertStartingPoints():
.count()
if not tmp:
for x in range(2070000):
new = db.Releases(dlstate=0, postid=x)
new = db.Releases(dlstate=0, postid=x, source='Danbooru')
db.session.add(new)
if x % 10000 == 0:
print("Loop ", x, "flushing...")
@ -38,8 +39,8 @@ def go():
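# Fan the fetch workers out across a thread pool; each worker loops in
# fetcher.run() until the job queue drains or runstate.run goes False.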
executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
try:
# for x in range(THREADS):
for x in range(1):
# for x in range(2):
for x in range(THREADS):
executor.submit(fetcher.run, x)
executor.shutdown()
except KeyboardInterrupt: