It now scrapinates. Whoo?

Fake-Name 2015-07-11 15:48:20 -07:00
parent d0a718886c
commit 2d67b291aa
4 changed files with 217 additions and 37 deletions

View File

@ -1,26 +1,9 @@
# import rpc
# import task_exceptions
# import deps.ExContentLoader
# import deps.ContentLoader
# import deps.LibraryContentEnqueue
# import deps.LibraryContentEnqueue
# import deps.ExExtract
# import deps.nameTools as nt
# import os.path
# import traceback
# import string
# import settings
# import time
# import pprint
# import traceback
# from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from sqlalchemy import Table
# from sqlalchemy import MetaData
from sqlalchemy import Column
from sqlalchemy import Integer
@ -36,8 +19,6 @@ from sqlalchemy.schema import UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.associationproxy import association_proxy
# from sqlalchemy.sql.expression import func
# from citext import CIText
# Patch in knowledge of the citext type, so it reflects properly.
from sqlalchemy.dialects.postgresql.base import ischema_names
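# The registration itself falls outside this hunk; it is presumably a
# single mapping so reflection can resolve the DB type (a sketch,
# assuming the citext package's CIText type):
# ischema_names['citext'] = citext.CIText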
@ -54,8 +35,8 @@ SQLALCHEMY_DATABASE_URI = 'postgresql://{user}:{passwd}@{host}:5432/{database}'.
engine = create_engine(SQLALCHEMY_DATABASE_URI)
SessionFactory = sessionmaker(bind=engine)
Session = scoped_session(SessionFactory)
session = Session()
session = scoped_session(SessionFactory)
# session = Session()
Base = declarative_base()
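# Many-to-many association tables linking releases to tags and characters.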
db_tags_link = Table(
@ -151,6 +132,7 @@ class Releases(Base):
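# dlstate tracks fetch progress. Values the fetcher assigns:
#    0 = unfetched, 1 = claimed by a worker, 2 = image downloaded
#   -1 = URL error, -2 = banned post, -3 = gold-account-only post,
#   -4 = deleted post, -5 = no image found on the page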
dlstate = Column(Integer, nullable=False, index=True)
postid = Column(Integer, nullable=False, index=True)
source = Column(citext.CIText, nullable=False, index=True)
fsize = Column(BigInteger)
score = Column(Float)
@ -167,6 +149,7 @@ class Releases(Base):
filepath = Column(Text)
status = Column(Text)
rating = Column(Text)
tags_rel = relationship('Tags', secondary=lambda: db_tags_link)
character_rel = relationship('Characters', secondary=lambda: db_chars_link)

View File

@ -5,18 +5,27 @@ import logging
import traceback
import sqlalchemy.exc
import runstate
import urllib.error
import urllib.parse
import re
import parsedatetime
import os
import settings
import os.path
import time
import datetime
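# Fetches Danbooru post pages, extracts tags and metadata into the
# Releases table, and downloads the associated image files.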
class DanbooruFetcher(object):
def __init__(self):
self.log = logging.getLogger("Main.Danbooru")
self.wg = webFunctions.WebGetRobust()
self.session = db.Session()
# db.session = db.Session()
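# Claim the next unfetched release (dlstate == 0, lowest postid first)
# and mark it in-flight. Concurrent workers can race on the same row,
# so database errors are logged and the query is simply retried.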
def get_job(self):
while 1:
try:
job = self.session.query(db.Releases) \
job = db.session.query(db.Releases) \
.filter(db.Releases.dlstate == 0) \
.order_by(db.Releases.postid) \
.limit(1) \
@ -24,21 +33,203 @@ class DanbooruFetcher(object):
if job is None:
return None
job.dlstate = 1
self.session.commit()
db.session.commit()
return job
except sqlalchemy.exc.DatabaseError:
self.log.warning("Error when getting job. Probably a concurrency issue.")
self.log.warning("Trying again.")
for line in traceback.format_exc().split("\n"):
self.log.warning(line)
self.session.rollback()
db.session.rollback()
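# Pull the tag, artist, and character names out of the post's tag-list
# sidebar (danbooru marks them with category-0/1/4 classes) and merge
# any new ones into the job row.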
def extractTags(self, job, tagsection):
characterlis = tagsection.find_all('li', class_='category-4')
artistlis = tagsection.find_all('li', class_='category-1')
taglis = tagsection.find_all('li', class_='category-0')
tags = []
for tagli in taglis:
tag = tagli.find('a', class_="search-tag").get_text()
tags.append(tag)
artists = []
for artistli in artistlis:
artist = artistli.find('a', class_="search-tag").get_text()
artists.append(artist)
characters = []
for characterli in characterlis:
character = characterli.find('a', class_="search-tag").get_text()
characters.append(character)
for tag in tags:
if tag not in job.tags:
job.tags.append(tag)
for artist in artists:
if artist not in job.artist:
job.artist.append(artist)
for character in characters:
if character not in job.character:
job.character.append(character)
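# Extract image dimensions from a string like "1.28 MB (1280x960)".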
def getxy(self, instr):
found = re.search(r"\((\d+)x(\d+)\)", instr)
x, y = found.groups()
return x, y
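# Walk the post-information sidebar's key: value items, copying rating,
# favorites, score, post date, size, and status onto the job. Returns
# the URL of the full-size image when the Size entry links to one.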
def extractInfo(self, job, infosection):
imgurl = None
for li in infosection.find_all("li"):
rawt = li.get_text()
name, val = rawt.split(":", 1)
name = name.strip()
val = val.strip()
if name == 'Rating':
job.rating = val
elif name == 'Favorites':
job.favorites = val
elif name == 'Score':
job.score = val
elif name == 'Date':
cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse(val)
assert pstat in (1, 2)
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
elif name == 'Size':
if '\n' not in val:
return False
fsize, res = val.split("\n")
fsize, res = fsize.strip(), res.strip()
job.imgx, job.imgy = self.getxy(res)
link = li.find("a")
if link:
imgurl = link['href']
elif name == 'Status':
job.status = val
# Do not try to fetch things that are banned (e.g. removed)
if val == 'Banned':
job.dlstate = -2
elif name in ['Approver', 'ID', 'Source', 'Uploader']:
pass
else:
self.log.warning("Unknown item key-value:")
self.log.warning(" '{}' -> '{}'".format(name, val))
return imgurl
def extractMeta(self, job, soup):
tagsection = soup.find('section', id='tag-list')
assert tagsection
infosection = soup.find('section', id='post-information')
assert infosection
self.extractTags(job, tagsection)
imgurl = self.extractInfo(job, infosection)
return imgurl
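# Persist a downloaded file under settings.storeDir, sharded into
# subdirectories by the first three hex characters of its (hash-derived)
# filename. Returns the path relative to the storage root.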
def saveFile(self, filename, fileCont):
if not os.path.exists(settings.storeDir):
self.log.warn("Cache directory for book items did not exist. Creating")
self.log.warn("Directory at path '%s'", settings.storeDir)
os.makedirs(settings.storeDir)
fHash, ext = os.path.splitext(filename)
ext = ext.lower()
fHash = fHash.upper()
# Use the first 3 chars of the hash for the folder name.
# Since it's hex-encoded, that gives at most 2^12 = 4096 directories.
dirName = fHash[:3]
dirPath = os.path.join(settings.storeDir, dirName)
if not os.path.exists(dirPath):
os.makedirs(dirPath)
# The "." is part of the ext.
filename = '{filename}{ext}'.format(filename=fHash, ext=ext)
fqpath = os.path.join(dirPath, filename)
fqpath = os.path.abspath(fqpath)
if not fqpath.startswith(settings.storeDir):
raise ValueError("Generating the file path to save a fetched file produced a path outside the storage directory.")
locpath = fqpath[len(settings.storeDir):]
with open(fqpath, "wb") as fp:
fp.write(fileCont)
return locpath
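# Download the image (sending the post page as the Referer), save it,
# and mark the job complete (dlstate = 2).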
def fetchImage(self, job, url, srcurl):
url = urllib.parse.urljoin(srcurl, url)
fname = url.split("/")[-1]
cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
fpath = self.saveFile(fname, cont)
self.log.info("Saved file to path: '%s'", fpath)
job.filename = fname
job.filepath = fpath
job.dlstate = 2
db.session.commit()
# print(fname)
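# Process one claimed job: fetch the post page, short-circuit on fetch
# errors and banned/gold-only/deleted posts, then extract metadata and
# download the image, retrying up to 5 times on IntegrityError.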
def processJob(self, job):
pageurl = 'https://danbooru.donmai.us/posts/{}'.format(job.postid)
try:
soup = self.wg.getSoup(pageurl)
except urllib.error.URLError:
job.dlstate = -1
db.session.commit()
return
text = soup.get_text()
if 'You need a gold account to see this image.' in text:
job.dlstate = -3
db.session.commit()
return
if 'This post was deleted for the following reasons' in text:
job.dlstate = -4
db.session.commit()
return
err = 0
while err < 5:
try:
imgurl = self.extractMeta(job, soup)
if imgurl:
self.fetchImage(job, imgurl, pageurl)
else:
self.log.info("No image found for URL: '%s'", pageurl)
job.dlstate = -5
break
except sqlalchemy.exc.IntegrityError:
err += 1
db.session.rollback()
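# Claim one job and process it; returns False once the queue is empty.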
def retrieveItem(self):
job = self.get_job()
if not job:
return False
self.processJob(job)
return True
@ -48,11 +239,14 @@ class DanbooruFetcher(object):
def run(indice):
print("Runner {}!".format(indice))
fetcher = DanbooruFetcher()
job = True
remainingTasks = True
try:
while job and runstate.run:
job = fetcher.get_job()
print("Have job: ", job, job.postid)
while remainingTasks and runstate.run:
remainingTasks = fetcher.retrieveItem()
except KeyboardInterrupt:
return
except Exception:
print("Unhandled exception!")
traceback.print_exc()
raise

View File

@ -2,7 +2,7 @@
import logging
import colorama as clr
import threading
import os.path
import sys
import time
@ -35,9 +35,11 @@ class ColourHandler(logging.Handler):
# print record.name
segments = record.name.split(".")
tname = threading.current_thread().name
if segments[0] == "Main" and len(segments) > 1:
segments.pop(0)
segments[0] = "Main."+segments[0]
# segments.pop(0)
# segments[0] = "Main."+segments[0]
segments[0] = "Main."+tname
nameList = []

View File

@ -7,7 +7,8 @@ import fetcher
import runstate
import concurrent.futures
THREADS = 10
THREADS = 1
THREADS = 25
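# Seed the Releases table with one row per candidate post id, if the
# table is currently empty.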
def insertStartingPoints():
@ -16,7 +17,7 @@ def insertStartingPoints():
.count()
if not tmp:
for x in range(2070000):
new = db.Releases(dlstate=0, postid=x)
new = db.Releases(dlstate=0, postid=x, source='Danbooru')
db.session.add(new)
if x % 10000 == 0:
print("Loop ", x, "flushing...")
@ -38,8 +39,8 @@ def go():
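# Fan the fetch workers out across a thread pool; each worker loops in
# fetcher.run() until the job queue drains or runstate.run goes False.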
executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
try:
# for x in range(THREADS):
for x in range(1):
# for x in range(2):
for x in range(THREADS):
executor.submit(fetcher.run, x)
executor.shutdown()
except KeyboardInterrupt: