From 1aa23060c63140f67b16bd2b49737cbd4fa82f44 Mon Sep 17 00:00:00 2001
From: Fake-Name
Date: Mon, 31 Aug 2015 19:37:56 -0700
Subject: [PATCH] MOAR SITEZ

---
 fetchBase.py     |   7 +-
 gelbooruFetch.py |   2 +-
 main.py          |  43 ++++++++--
 r34xxxScrape.py  | 212 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 253 insertions(+), 11 deletions(-)
 create mode 100644 r34xxxScrape.py

diff --git a/fetchBase.py b/fetchBase.py
index 5fad567..e07b490 100644
--- a/fetchBase.py
+++ b/fetchBase.py
@@ -43,11 +43,12 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
 		while 1:
 			try:
 				job = db.session.query(db.Releases) \
-					.filter(db.Releases.dlstate == 0) \
 					.filter(db.Releases.source == self.pluginkey) \
+					.filter(db.Releases.dlstate == 0) \
 					.order_by(db.Releases.postid) \
-					.limit(1) \
-					.one()
+					.limit(1)
+
+				job = job.scalar()
 				if job == None:
 					return None
 				job.dlstate = 1
diff --git a/gelbooruFetch.py b/gelbooruFetch.py
index c298f2d..4936f87 100644
--- a/gelbooruFetch.py
+++ b/gelbooruFetch.py
@@ -206,7 +206,7 @@ class GelbooruFetcher(object):
 				soup = self.wg.getSoup(pageurl)
 				if 'You are viewing an advertisement' in soup.get_text():
-					self.log.warning("Working around advertisement. Sleeping 10 seconds")
-					time.sleep(10)
+					self.log.warning("Working around advertisement. Sleeping 13 seconds")
+					time.sleep(13)
 				else:
 					break
 			except urllib.error.URLError:
diff --git a/main.py b/main.py
index 228fdf0..3ad307b 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@ logSetup.initLogging()
 
 import danbooruFetch
 import gelbooruFetch
+import r34xxxScrape
 import runstate
 import concurrent.futures
 
@@ -16,8 +17,9 @@ def insertDanbooruStartingPoints():
 
 	tmp = db.session.query(db.Releases) \
 		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Danbooru') \
-		.count()
+		.filter(db.Releases.source == 'Danbooru')
+
+	tmp = tmp.count()
 	if not tmp:
 		for x in range(2070000):
 			new = db.Releases(dlstate=0, postid=x, source='Danbooru')
@@ -52,6 +54,30 @@ def insertGelbooruStartingPoints():
 	# 		db.session.flush()
 	# print("Flushed.")
 	db.session.commit()
+def insertR34xxxStartingPoints():
+
+	tmp = db.session.query(db.Releases) \
+		.filter(db.Releases.postid == 1) \
+		.filter(db.Releases.source == 'Rule34.xxx') \
+		.count()
+	if not tmp:
+
+		print("Building insert data structure")
+		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
+		print("Building insert query")
+		q = db.Releases.__table__.insert().values(dat)
+		print("Built. Doing insert.")
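+		# Single bulk INSERT for all ~1.8M seed rows; adding them one ORM object at a
+		# time would be far slower (assumes the backend accepts one statement this large).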
+		db.engine.execute(q)
+		print("Done.")
+		# for x in :
+
+		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
+		# 	# db.session.add(new)
+		# 	if x % 100000 == 0:
+		# 		print("Loop ", x, "flushing...")
+		# 		db.session.flush()
+		# print("Flushed.")
+	db.session.commit()
 
 
 def resetDlstate():
@@ -64,20 +90,23 @@ def go():
 
 	insertDanbooruStartingPoints()
 	insertGelbooruStartingPoints()
+	insertR34xxxStartingPoints()
 	resetDlstate()
-
+	# r34xxxScrape.run(0)
 
 	executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
 	try:
 		# for x in range(2):
 		# 	executor.submit(danbooruFetch.run, 0)
 		# 	executor.submit(gelbooruFetch.run, 0)
 
-		for x in range(THREADS//2):
-			executor.submit(danbooruFetch.run, x)
-		for x in range(THREADS//2):
-			executor.submit(gelbooruFetch.run, x)
+		for x in range(THREADS):
+			executor.submit(r34xxxScrape.run, x)
+		# for x in range(THREADS//2):
+		# 	executor.submit(danbooruFetch.run, x)
+		# for x in range(THREADS//2):
+		# 	executor.submit(gelbooruFetch.run, x)
 
 		executor.shutdown()
 	except KeyboardInterrupt:
 		print("Waiting for executor.")
diff --git a/r34xxxScrape.py b/r34xxxScrape.py
new file mode 100644
index 0000000..1c6971d
--- /dev/null
+++ b/r34xxxScrape.py
@@ -0,0 +1,212 @@
+
+import database as db
+import webFunctions
+import logging
+import traceback
+import sqlalchemy.exc
+import runstate
+import urllib.error
+import urllib.parse
+import re
+import parsedatetime
+import os
+import settings
+import os.path
+import time
+import datetime
+
+import fetchBase
+
+import danbooruFetch
+class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
+
+	pluginkey = 'Rule34.xxx'
+	loggerpath = "Main.Rule34-xxx"
+
+	def __init__(self):
+		self.log = logging.getLogger("Main.Rule34-xxx")
+		self.wg = webFunctions.WebGetRobust(logPath="Main.Rule34-xxx.Web")
+
+	def extractTags(self, job, tagsection):
+
+		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		artistlis = tagsection.find_all('li', class_='tag-type-artist')
+		taglis = tagsection.find_all('li', class_='tag-type-general')
+
+
+		tags = []
+		for tagli in taglis:
+			tag = tagli.find('a').get_text()
+			tags.append(tag)
+
+		artists = []
+		for artistli in artistlis:
+			artist = artistli.find('a').get_text()
+			artists.append(artist)
+
+		characters = []
+		for characterli in characterlis:
+			character = characterli.find('a').get_text()
+			characters.append(character)
+
+		for tag in tags:
+			if tag not in job.tags:
+				job.tags.append(tag)
+		for artist in artists:
+			if artist not in job.artist:
+				job.artist.append(artist)
+		for character in characters:
+			if character not in job.character:
+				job.character.append(character)
+
+	def getxy(self, instr):
+		found = re.search(r"\((\d+)x(\d+)\)", instr)
+		x, y = found.groups()
+		return x, y
+
+	def extractInfo(self, job, infosection):
+		imgurl = None
+		for li in infosection.find_all("li"):
+			rawt = li.get_text()
+			name, val = rawt.split(":", 1)
+
+			name = name.strip()
+			val = val.strip()
+
+			if name == 'Rating':
+				pass
+				# job.rating = val
+			elif name == 'Favorites':
+				pass
+				# job.favorites = val
+			elif name == 'Score':
+				val = val.strip()
+				val = val.split()[0]
+				job.score = val
+			elif name == 'Posted':
+				cal = parsedatetime.Calendar()
+				val = val.split("by")[0]
+				tstruct, pstat = cal.parse(val)
+				assert pstat == 1 or pstat == 2
+				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
+			elif name == 'Size':
+				if not '\n' in val:
+					return False
+				fsize, res = val.split("\n")
+				fsize, res = fsize.strip(), res.strip()
+				job.imgx, job.imgy = self.getxy(res)
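+				# res is the "(WIDTHxHEIGHT)" resolution string; getxy() pulls the
+				# two dimensions out of it with a regex.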
+
+				link = li.find("a")
+				if link:
+					imgurl = link['href']
+
+			elif name == 'Status':
+				pass
+				# job.status = val
+				# # Do not try to fetch things that are banned (e.g. removed)
+				# if val == 'Banned':
+				# 	job.dlstate=-2
+			elif name in ['Approver', 'Id', 'Source', 'Uploader']:
+				pass
+			else:
+				self.log.warning("Unknown item key-value:")
+				self.log.warning(" '{}' -> '{}'".format(name, val))
+		return imgurl
+
+	def extractMeta(self, job, soup):
+		tagsection = soup.find('ul', id='tag-sidebar')
+		assert tagsection
+		infosection = soup.find('div', id='stats')
+		assert infosection
+		self.extractTags(job, tagsection)
+		imgurl = self.extractInfo(job, infosection)
+		return imgurl
+
+
+
+	def fetchImage(self, job, url, srcurl):
+		url = urllib.parse.urljoin(srcurl, url)
+		fname = url.split("/")[-1]
+
+
+		cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
+
+		fpath = self.saveFile(job, fname, cont)
+		self.log.info("Saved file to path: '%s'", fpath)
+
+		job.filename = fname
+		job.filepath = fpath
+		job.dlstate = 2
+		db.session.commit()
+		# print(fname)
+
+	def processJob(self, job):
+		pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
+		try:
+			soup = self.wg.getSoup(pageurl)
+		except urllib.error.URLError:
+			job.dlstate=-1
+			db.session.commit()
+			return
+
+		text = soup.get_text()
+		if 'You need a gold account to see this image.' in text:
+			job.dlstate=-3
+			db.session.commit()
+			return
+		if 'This post was deleted for the following reasons' in text:
+			job.dlstate=-4
+			db.session.commit()
+			return
+		if 'Save this flash' in text:
+			job.dlstate=-9
+			db.session.commit()
+			return
+		err = 0
+		while err < 5:
+			try:
+				imgurl = self.extractMeta(job, soup)
+				if imgurl:
+					self.fetchImage(job, imgurl, pageurl)
+				else:
+					self.log.info("No image found for URL: '%s'", pageurl)
+					job.dlstate=-5
+				break
+			except AssertionError:
+				self.log.info("Assertion error?: '%s'", pageurl)
+				job.dlstate=-50
+				db.session.rollback()
+				break
+
+			except sqlalchemy.exc.IntegrityError:
+				err += 1
+				db.session.rollback()
+
+
+
+	def retreiveItem(self):
+		job = self.get_job()
+		if not job:
+			return False
+
+		self.processJob(job)
+		return True
+
+
+
+
+def run(indice):
+	print("Runner {}!".format(indice))
+	fetcher = R34xxxFetcher()
+	remainingTasks = True
+
+	try:
+		while remainingTasks and runstate.run:
+			remainingTasks = fetcher.retreiveItem()
+	except KeyboardInterrupt:
+		return
+	except:
+		print("Unhandled exception!")
+		traceback.print_exc()
+		raise