# User:GrabberBot/grabber.py
# From The Battle for Wesnoth Wiki
#!/usr/bin/python
# version 7: works on mediawiki
# version 6: no repeated images, no images from signatures
# version 5: only use allowed forums
# version 4: supports zlib compression
# version 3: also grabs inline <img>s, not just attachments
# version 2: can run safely from a cron job, without getting stuck at
# deleted posts
# version 1: the first one
import urllib2, re, sys, os, time
def parse_args(argv):
    """Parse the command line switches understood by the grabber.

    Recognised switches (anything else is ignored):
      -q        quiet: turn off the progress messages (VERBOSE = 0)
      -i        write <img> tags instead of bare image links
      -s PID    stop before examining post PID

    argv is the full argument list including the program name at index 0.
    Returns the tuple (verbose, write_img_tags, until_pid).
    Raises ValueError if -s is not followed by an integer post id.
    """
    verbose = 1
    write_img_tags = 0
    until_pid = 0
    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg == "-q":
            verbose = 0
        elif arg == "-i":
            write_img_tags = 1
        elif arg == "-s":
            # -s needs a value; without this check a trailing "-s" would
            # die with a bare IndexError.
            if i + 1 >= len(argv):
                raise ValueError("option -s requires a post id")
            try:
                until_pid = int(argv[i + 1])
            except ValueError:
                raise ValueError(
                    "option -s expects an integer post id, got %r"
                    % (argv[i + 1],))
            # the value has been consumed, do not look at it as a switch
            i += 1
        i += 1
    return verbose, write_img_tags, until_pid


# VERBOSE: write progress messages to stderr (cleared by -q)
# WRITE_IMG_TAGS: emit <img> tags instead of plain links (set by -i)
# UNTIL_PID: post id at which to stop, 0 meaning "no limit" (set by -s)
try:
    VERBOSE, WRITE_IMG_TAGS, UNTIL_PID = parse_args(sys.argv)
except ValueError:
    sys.stderr.write("%s\n" % (sys.exc_info()[1],))
    sys.exit(2)
# Names of the forums whose posts are scanned for images; a post's forum
# name is compared against this list.
allowed_forums = [
    "Ideas",
    "Developer's Discussions",
    "Art Development",
    "Scenario & Campaign Development",
]

# Image links already written during this run (used as a set: link -> 1),
# so that the same image is not listed twice.
ALREADY = dict()

# forum_url:   prefix that turns relative "files/..." image links absolute
# topic_url:   page used to build the link back to a single post
# wiki_url:    base of the per-author "Contrib" wiki pages
# command_url: script behind the generated "Move" and "X" (delete) links
command_url = "http://allefant.sourceforge.net/wesnoth/graphiclibrary.py"
wiki_url = "http://wesnoth.org/wiki"
topic_url = "http://www.wesnoth.org/forum/viewtopic.php"
forum_url = "http://www.wesnoth.org/forum/"
# The last processed post id is kept in the file pid.txt. The file holds
# one or two whitespace separated integers: the post id to start with and,
# optionally, the number of missing posts counted by the previous run.
# The allowed number of missing posts for this run is that count plus 2.
try:
    pid_file = open("pid.txt")
    try:
        pid_fields = pid_file.read().split()
    finally:
        # close explicitly instead of relying on garbage collection
        pid_file.close()
except IOError:
    print("""You need to create a file called pid.txt, which has the
number of the first post to examine. You can find this number by
hovering the mouse over the tiny white page image in the header line
of each post. From then on, this script will maintain pid.txt itself.""")
    sys.exit(1)
try:
    last_pid = pid = int(pid_fields[0])
    if len(pid_fields) > 1:
        max_missing = 2 + int(pid_fields[1])
    else:
        max_missing = 2
except (IndexError, ValueError):
    # An empty or garbled pid.txt used to end in a bare traceback.
    print("pid.txt must contain the number of the first post to examine, "
          "optionally followed by a count of missing posts.")
    sys.exit(1)
def gethttp(url):
    """Fetch url and return the body of the response as a string.

    The request advertises gzip support and identifies itself with the
    User-Agent "grabber bot". A response whose Content-Encoding is "gzip"
    is decompressed before it is returned.

    If the URL cannot be opened, a message naming the URL is printed and
    the script exits with status 1.
    """
    # Local import: the file-level import list does not include zlib.
    import zlib

    http = urllib2.Request(url)
    http.add_header("Accept-Encoding", "gzip")
    http.add_header("User-Agent", "grabber bot")
    try:
        page = urllib2.urlopen(http)
    except urllib2.URLError:
        # URLError is the base class of HTTPError, so this also covers a
        # server that cannot be reached at all. Report the URL itself
        # rather than the repr of the Request object.
        print("Cannot open %s\n" % url)
        sys.exit(1)
    try:
        contents = page.read()
        encoding = page.info().get("Content-Encoding")
    finally:
        page.close()
    if encoding == "gzip":
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        # Decompressing in-process avoids depending on an external gunzip
        # and the deadlock risk of writing the whole page into a pipe
        # before reading any of the output back.
        contents = zlib.decompress(contents, 16 + zlib.MAX_WBITS)
    return contents
# Main loop. Starting at the pid read from pid.txt, posts are fetched one
# by one and scanned for images. We keep reading posts until too many of
# them in a row cannot be found, and then assume we reached the last post.
# The next time the program is run, it will start with the pid that
# failed again.
#
# NOTE(review): every line of this listing starts at column 0, i.e. the
# original indentation has been lost and has to be restored before the
# script can run. For most statements the nesting follows from the syntax,
# but for a few of them (`mob = None`, `posts.close()`, `icount += 1`, the
# final pid.txt write) it cannot be recovered from the text alone -
# confirm against a known-good copy.
#
# missing counts the posts for which no forum link was found; it is
# compared against max_missing and set back to 0 further down.
missing = 0
while 1:
# Stop when the pid requested with -s is reached: print a notice, sleep
# for 5 seconds and exit with status 0.
if UNTIL_PID == pid:
print "Requested stop before pid %d" % pid
time.sleep(5)
sys.exit(0)
if VERBOSE:
sys.stderr.write("Examining post %d\n" % pid)
# Fetch the page showing this post. The address is spelled out here
# instead of being built from forum_url / topic_url.
html = gethttp("http://www.wesnoth.org/forum/" +
"viewtopic.php?p=%s" % pid)
# The title of the <link rel="up"> element is taken as the name of the
# forum the post belongs to.
mob = re.compile(
"""<link rel="up" href=".*?" title="(.*?)" />""").search(html)
if mob: forum = mob.group(1)
if mob and forum in allowed_forums:
# Post in an allowed forum: from here on mob is the match of the author
# name that follows the anchor named after the pid (None if not found).
mob = re.compile("""<span class="name">""" +
"""<a name="%s"></a><b>(.*?)</b>""" % pid).search(html)
else:
# Either no forum link was found (mob is None), or the post is in a forum
# that is not in allowed_forums.
# assume this was the last post
if VERBOSE:
if mob:
sys.stderr.write(" (posted to %s)\n" % forum)
else:
sys.stderr.write(" not posted (yet?)\n")
# No forum link: count the post as missing and leave the loop once more
# than max_missing posts have been counted.
if not mob:
missing += 1
if missing > max_missing:
break
# NOTE(review): nesting unknown. Inside `if not mob:` this assignment is a
# no-op; directly under `else:` it would also discard the match for posts
# in other forums, so that the `if mob:` below skips them.
mob = None
# A match is still held: reset both counters and remember the following
# pid as the value to be stored in pid.txt.
if mob:
max_missing = missing = 0
last_pid = pid + 1
if mob and forum in allowed_forums:
# the author of the post
name = mob.group(1)
if VERBOSE:
sys.stderr.write(" by %s\n" % name)
# Cut the posting out of the page: it runs from the end of the author
# match up to the next "Back to top" link.
# NOTE(review): mob2 is used without a None check here and for "Posted:"
# below; if either marker is absent from the page, .start()/.end() on None
# raises AttributeError.
mob2 = re.compile("""<span class="nav">""" +
"""<a href="#top" class="nav">Back to top</a></span>"""
).search(html, mob.end(0))
posting = html[mob.end(0):mob2.start(0)]
# Drop everything up to and including the "Posted:" label.
mob2 = re.compile("""Posted:""").search(posting)
posting = posting[mob2.end(0):]
# Cut the posting at a run of underscores if there is one - presumably the
# separator in front of the signature (cf. the version 6 note about
# signatures in the file header).
mob2 = re.compile("_________________").search(posting)
if mob2:
posting = posting[:mob2.start(0)]
# go through all images in the post
icount = 0
pos = 0
while 1:
# Next <img src="..."> at or after pos; mob is reused for the image match.
mob = re.compile("<img src=\"(.*?)\"").search(posting, pos)
if mob: # image
image_link = mob.group(1)
# Links starting with "templates/" or "images/" are skipped - presumably
# these are the forum's own graphics rather than posted images.
if not image_link.startswith("templates/") and (not
image_link.startswith("images/")):
# post.txt is the file to which all newly discovered image attachments
# are appended, so they can be processed further by another script
# (it is opened in append mode for every accepted image)
posts = file("post.txt", "a")
# Relative "files/" links are made absolute; spaces are escaped as %20.
if image_link.startswith("files/"):
image_link = forum_url + image_link
image_link = image_link.replace(" ", "%20")
# ALREADY holds the links written so far in this run, so each image is
# listed only once.
if not image_link in ALREADY:
ALREADY[image_link] = 1
# One output line per image, made of bracketed links: the author's Contrib
# wiki page, "Move" and "X" (delete) commands for command_url, the image
# itself (as an <img> tag when -i was given) and a link back to the post.
post_link = """[%s?p=%d#%d post %d]""" % (
topic_url, pid, pid, pid)
name_link = """[%s/%sContrib %s] | """ % (
wiki_url, name.replace(" ", "%20"), name)
name_link += """[%s?command=move%%20%s%%20%s Move] | """ % (
command_url, name.replace(" ", "%20"), image_link)
name_link += """[%s?command=delete%%20%s X]""" % (
command_url, image_link)
if WRITE_IMG_TAGS:
image_link = """<img src="%s" />""" % image_link
posts.write("* %s | %s | (%s)\n" % (name_link, image_link, post_link))
# NOTE(review): nesting of the next two statements is unknown; whether
# icount counts every <img>, every accepted link or only newly written
# ones depends on it.
posts.close()
icount += 1
# Continue the search behind the current match.
pos = mob.end(0)
else: # no more images
break
if VERBOSE:
sys.stderr.write(" with %d images\n" % icount)
# On to the next post id.
pid += 1
# update pid.txt with the pid to start from next time and the current
# count of missing posts (read back at startup as the second number)
# NOTE(review): presumably this runs once after the loop - confirm.
file("pid.txt", "w").write(str(last_pid) + " " + str(missing))
# This page was last edited on 19 August 2005, at 20:23.