MOAR SITEZ

Fake-Name 2015-08-31 19:37:56 -07:00
parent af85fd8089
commit 1aa23060c6
4 changed files with 253 additions and 11 deletions


@@ -43,11 +43,12 @@ class AbstractFetcher(object, metaclass=abc.ABCMeta):
while 1:
try:
job = db.session.query(db.Releases) \
.filter(db.Releases.dlstate == 0) \
.filter(db.Releases.source == self.pluginkey) \
.filter(db.Releases.dlstate == 0) \
.order_by(db.Releases.postid) \
.limit(1) \
.one()
.limit(1)
job = job.scalar()
if job == None:
return None
job.dlstate = 1


@@ -206,7 +206,7 @@ class GelbooruFetcher(object):
soup = self.wg.getSoup(pageurl)
if 'You are viewing an advertisement' in soup.get_text():
self.log.warning("Working around advertisement. Sleeping 10 seconds")
time.sleep(10)
time.sleep(13)
else:
break
except urllib.error.URLError:

main.py

@@ -6,6 +6,7 @@ logSetup.initLogging()
import danbooruFetch
import gelbooruFetch
import r34xxxScrape
import runstate
import concurrent.futures
@@ -16,8 +17,9 @@ def insertDanbooruStartingPoints():
tmp = db.session.query(db.Releases) \
.filter(db.Releases.postid == 1) \
.filter(db.Releases.source == 'Danbooru') \
.count()
.filter(db.Releases.source == 'Danbooru')
tmp = tmp.count()
if not tmp:
for x in range(2070000):
new = db.Releases(dlstate=0, postid=x, source='Danbooru')
@@ -52,6 +54,30 @@ def insertGelbooruStartingPoints():
# db.session.flush()
# print("Flushed.")
db.session.commit()
def insertR34xxxStartingPoints():
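    # Seed one dlstate=0 placeholder row per rule34.xxx post id with a single bulk INSERT,
    # unless the table already has a row for this source (postid 1 is used as the sentinel).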
tmp = db.session.query(db.Releases) \
.filter(db.Releases.postid == 1) \
.filter(db.Releases.source == 'Rule34.xxx') \
.count()
if not tmp:
print("Building insert data structure")
dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
print("Building insert query")
q = db.Releases.__table__.insert().values(dat)
print("Built. Doing insert.")
db.engine.execute(q)
print("Done.")
# for x in :
# new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
# # db.session.add(new)
# if x % 100000 == 0:
# print("Loop ", x, "flushing...")
# db.session.flush()
# print("Flushed.")
db.session.commit()
def resetDlstate():
@@ -64,20 +90,23 @@ def resetDlstate():
def go():
insertDanbooruStartingPoints()
insertGelbooruStartingPoints()
insertR34xxxStartingPoints()
resetDlstate()
# r34xxxScrape.run(0)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
try:
# for x in range(2):
# executor.submit(danbooruFetch.run, 0)
# executor.submit(gelbooruFetch.run, 0)
for x in range(THREADS//2):
executor.submit(danbooruFetch.run, x)
for x in range(THREADS//2):
executor.submit(gelbooruFetch.run, x)
for x in range(THREADS):
executor.submit(r34xxxScrape.run, x)
# for x in range(THREADS//2):
# executor.submit(danbooruFetch.run, x)
# for x in range(THREADS//2):
# executor.submit(gelbooruFetch.run, x)
executor.shutdown()
except KeyboardInterrupt:
print("Waiting for executor.")

r34xxxScrape.py Normal file

@@ -0,0 +1,212 @@
import database as db
import webFunctions
import logging
import traceback
import sqlalchemy.exc
import runstate
import urllib.error
import urllib.parse
import re
import parsedatetime
import os
import settings
import os.path
import time
import datetime
import fetchBase
import danbooruFetch
class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
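    # Scraper plugin for rule34.xxx. It reuses the inherited get_job()/saveFile() helpers
    # and overrides the page fetching and parsing for this site's markup.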
pluginkey = 'Rule34.xxx'
loggerpath = "Main.Rule34-xxx"
def __init__(self):
self.log = logging.getLogger("Main.Rule34-xxx")
self.wg = webFunctions.WebGetRobust(logPath="Main.Rule34-xxx.Web")
def extractTags(self, job, tagsection):
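        # The tag sidebar marks each <li> with tag-type-character / tag-type-artist /
        # tag-type-general; collect each group and append any values the job row lacks.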
characterlis = tagsection.find_all('li', class_='tag-type-character')
artistlis = tagsection.find_all('li', class_='tag-type-artist')
taglis = tagsection.find_all('li', class_='tag-type-general')
tags = []
for tagli in taglis:
tag = tagli.find('a').get_text()
tags.append(tag)
artists = []
for artistli in artistlis:
artist = artistli.find('a').get_text()
artists.append(artist)
characters = []
for characterli in characterlis:
character = characterli.find('a').get_text()
characters.append(character)
for tag in tags:
if tag not in job.tags:
job.tags.append(tag)
for artist in artists:
if artist not in job.artist:
job.artist.append(artist)
for character in characters:
if character not in job.character:
job.character.append(character)
def getxy(self, instr):
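        # Extract image dimensions from a '(WIDTHxHEIGHT)' string, e.g. '(1280x720)'.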
found = re.search(r"\((\d+)x(\d+)\)", instr)
x, y = found.groups()
return x, y
def extractInfo(self, job, infosection):
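        # Walk the stats sidebar's 'Name: value' items, copying score, post date and
        # dimensions onto the job; returns the full-size image URL from the 'Size' entry
        # when one is found.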
imgurl = None
for li in infosection.find_all("li"):
rawt = li.get_text()
name, val = rawt.split(":", 1)
name = name.strip()
val = val.strip()
if name == 'Rating':
pass
# job.rating = val
elif name == 'Favorites':
pass
# job.favorites = val
elif name == 'Score':
val = val.strip()
val = val.split()[0]
job.score = val
elif name == 'Posted':
cal = parsedatetime.Calendar()
val = val.split("by")[0]
tstruct, pstat = cal.parse(val)
assert pstat == 1 or pstat == 2
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
elif name == 'Size':
if not '\n' in val:
return False
fsize, res = val.split("\n")
fsize, res = fsize.strip(), res.strip()
job.imgx, job.imgy = self.getxy(res)
link = li.find("a")
if link:
imgurl = link['href']
elif name == 'Status':
pass
# job.status = val
# # Do not try to fetch things that are banned (e.g. removed)
# if val == 'Banned':
# job.dlstate=-2
elif name in ['Approver', 'Id', 'Source', 'Uploader']:
pass
else:
self.log.warning("Unknown item key-value:")
self.log.warning(" '{}' -> '{}'".format(name, val))
return imgurl
def extractMeta(self, job, soup):
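        # The tag sidebar and stats box must both be present; a missing section trips the
        # assertions, which processJob() turns into dlstate=-50.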
tagsection = soup.find('ul', id='tag-sidebar')
assert tagsection
infosection = soup.find('div', id='stats')
assert infosection
self.extractTags(job, tagsection)
imgurl = self.extractInfo(job, infosection)
return imgurl
def fetchImage(self, job, url, srcurl):
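        # Image links can be relative, so resolve against the post page URL and pass that
        # page as the Referer; dlstate=2 marks the file as successfully saved.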
url = urllib.parse.urljoin(srcurl, url)
fname = url.split("/")[-1]
cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
fpath = self.saveFile(job, fname, cont)
self.log.info("Saved file to path: '%s'", fpath)
job.filename = fname
job.filepath = fpath
job.dlstate = 2
db.session.commit()
# print(fname)
def processJob(self, job):
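        # Fetch the post page, flag posts that can never be downloaded with a negative
        # dlstate (network error, gold-only, deleted, flash-only), then parse the metadata
        # and download the image, retrying up to five times on IntegrityError.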
pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
try:
soup = self.wg.getSoup(pageurl)
except urllib.error.URLError:
job.dlstate=-1
db.session.commit()
return
text = soup.get_text()
if 'You need a gold account to see this image.' in text:
job.dlstate=-3
db.session.commit()
return
if 'This post was deleted for the following reasons' in text:
job.dlstate=-4
db.session.commit()
return
if 'Save this flash' in text:
job.dlstate=-9
db.session.commit()
return
err = 0
while err < 5:
try:
imgurl = self.extractMeta(job, soup)
if imgurl:
self.fetchImage(job, imgurl, pageurl)
else:
self.log.info("No image found for URL: '%s'", pageurl)
job.dlstate=-5
break
except AssertionError:
self.log.info("Assertion error?: '%s'", pageurl)
job.dlstate=-50
db.session.rollback()
break
except sqlalchemy.exc.IntegrityError:
err += 1
db.session.rollback()
def retreiveItem(self):
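        # Claim one queued post via the inherited get_job() and process it; returns False
        # once no work is left.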
job = self.get_job()
if not job:
return False
self.processJob(job)
return True
def run(indice):
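    # Thread entry point used by main.py: keep processing posts until the queue is empty
    # or runstate.run is cleared.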
print("Runner {}!".format(indice))
fetcher = R34xxxFetcher()
remainingTasks = True
try:
while remainingTasks and runstate.run:
remainingTasks = fetcher.retreiveItem()
except KeyboardInterrupt:
return
except:
print("Unhandled exception!")
traceback.print_exc()
raise