# User:GrabberBot/grabber.py

#!/usr/bin/python

# version 7: works on mediawiki
# version 6: no repeated images, no images from signatures
# version 5: only use allowed forums
# version 4: supports zlib compression
# version 3: also grabs inline <img>s, not just attachments
# version 2: can run safely from a cron job, without getting stuck at
#            deleted posts
# version 1: the first one
import urllib2, re, sys, os, time

def parse_args(argv):
    """Return (verbose, write_img_tags, until_pid) parsed from argv.

    Recognized options:
      -q      turn the progress messages off (verbose becomes 0)
      -i      write <img> tags instead of bare image links
      -s PID  stop once the post id PID is reached

    Unrecognized arguments are ignored. Raises ValueError when -s is not
    followed by an integer.
    """
    verbose = 1
    write_img_tags = 0
    until_pid = 0
    for index, arg in enumerate(argv):
        if arg == "-q":
            verbose = 0
        elif arg == "-i":
            write_img_tags = 1
        elif arg == "-s":
            # Without this check a trailing "-s" indexes past the end of
            # argv and dies with an IndexError.
            if index + 1 >= len(argv):
                raise ValueError("option -s requires a post id")
            try:
                until_pid = int(argv[index + 1])
            except ValueError:
                raise ValueError("option -s requires a numeric post id, "
                    "got %r" % argv[index + 1])
    return verbose, write_img_tags, until_pid

try:
    VERBOSE, WRITE_IMG_TAGS, UNTIL_PID = parse_args(sys.argv)
except ValueError:
    # Report bad usage as a one-line message instead of a traceback.
    sys.stderr.write("%s\n" % sys.exc_info()[1])
    sys.exit(2)

# Addresses used to fetch posts and to build the links written to post.txt.
forum_url = "http://www.wesnoth.org/forum/"
topic_url = "http://www.wesnoth.org/forum/viewtopic.php"
wiki_url = "http://wesnoth.org/wiki"
command_url = "http://allefant.sourceforge.net/wesnoth/graphiclibrary.py"

# Only posts made to one of these forums are searched for images.
allowed_forums = [
    "Ideas",
    "Developer's Discussions",
    "Art Development",
    "Scenario & Campaign Development",
]

# Image links already written during this run; only the keys matter.
ALREADY = dict()

# The id of the next post to examine is kept in the file pid.txt. It may be
# followed by the number of consecutive missing posts seen so far, which
# raises the number of missing posts tolerated by this run.
try:
    pid_file = open("pid.txt")
    try:
        x = pid_file.read().split()
    finally:
        # Close the handle explicitly instead of relying on garbage
        # collection.
        pid_file.close()
except IOError:
    sys.stdout.write("""You need to create a file called pid.txt, which has the
number of the first post to examine. You can find this number by
hovering the mouse over the tiny white page image in the header line
of each post. From then on, this script will maintain pid.txt itself.""" + "\n")
    sys.exit(1)

try:
    last_pid = pid = int(x[0])
    if len(x) > 1:
        max_missing = 2 + int(x[1])
    else:
        max_missing = 2
except (IndexError, ValueError):
    # An empty or non-numeric pid.txt used to end in a bare traceback.
    sys.stdout.write("pid.txt is empty or malformed; it must contain the "
        "number of the first post to examine.\n")
    sys.exit(1)

def gethttp(url):
    """Fetch url and return the response body as a string.

    The request announces gzip support and identifies itself as
    "grabber bot". A response marked with Content-Encoding: gzip is
    decompressed before it is returned.

    If the URL cannot be opened, a message naming the URL is printed and
    the program exits with status 1.
    """
    # Imported here so the function brings its own dependencies along.
    import gzip
    from StringIO import StringIO

    http = urllib2.Request(url)
    http.add_header("Accept-Encoding", "gzip")
    http.add_header("User-Agent", "grabber bot")
    try:
        page = urllib2.urlopen(http)
    except urllib2.URLError:
        # URLError is the base class of HTTPError, so this also covers
        # connection failures. Report the URL itself; formatting the
        # Request object only printed its repr.
        sys.stdout.write("Cannot open %s\n\n" % url)
        sys.exit(1)
    try:
        contents = page.read()
        headers = page.info()
        gzipped = ("Content-Encoding" in headers and
            headers["Content-Encoding"] == "gzip")
    finally:
        page.close()
    if gzipped:
        # Decompress in-process. Piping the whole body into an external
        # gunzip before reading any output can block once the pipe
        # buffers fill up, and needs the gunzip binary to be installed.
        contents = gzip.GzipFile(fileobj=StringIO(contents)).read()
    return contents

# Patterns that do not depend on the post id are compiled once, not on
# every pass through the loop.
_forum_re = re.compile(
    """<link rel="up" href=".*?" title="(.*?)" />""")
_back_to_top_re = re.compile("""<span class="nav">""" +
    """<a href="#top" class="nav">Back to top</a></span>""")
_img_re = re.compile("<img src=\"(.*?)\"")

def _post_text(html, start):
    """Return the body of the post that starts at offset start in html.

    The body runs from just after "Posted:" up to the next "Back to top"
    link, with everything from the signature separator line onwards cut
    off. Returns None when either of the two markers cannot be found.
    """
    end_mob = _back_to_top_re.search(html, start)
    if not end_mob:
        return None
    posting = html[start:end_mob.start(0)]
    posted_at = posting.find("Posted:")
    if posted_at < 0:
        return None
    posting = posting[posted_at + len("Posted:"):]
    # Images in the signature are not wanted.
    signature_at = posting.find("_________________")
    if signature_at >= 0:
        posting = posting[:signature_at]
    return posting

def _record_images(posting, name, pid):
    """Append one line to post.txt for every new image found in posting.

    name is the author of the post and pid its post id. Links starting
    with "templates/" or "images/" are skipped, and so are links already
    recorded in ALREADY. Returns the number of lines written.
    """
    count = 0
    for img_mob in _img_re.finditer(posting):
        image_link = img_mob.group(1)
        # presumably the forum's own theme/smiley graphics -- TODO confirm
        if (image_link.startswith("templates/") or
            image_link.startswith("images/")):
            continue
        if image_link.startswith("files/"):
            image_link = forum_url + image_link
        image_link = image_link.replace(" ", "%20")

        if image_link in ALREADY:
            continue
        ALREADY[image_link] = 1

        post_link = """[%s?p=%d#%d post %d]""" % (
            topic_url, pid, pid, pid)
        name_link = """[%s/%sContrib %s] | """ % (
            wiki_url, name.replace(" ", "%20"), name)
        name_link += """[%s?command=move%%20%s%%20%s Move] | """ % (
            command_url, name.replace(" ", "%20"), image_link)
        name_link += """[%s?command=delete%%20%s X]""" % (
            command_url, image_link)

        if WRITE_IMG_TAGS:
            image_link = """<img src="%s" />""" % image_link

        # post.txt is where all newly discovered images are appended, so
        # they can be processed further by another script. It is opened
        # only when there is something to write and always closed again;
        # previously a handle was leaked for every duplicate image.
        posts = open("post.txt", "a")
        try:
            posts.write("* %s | %s | (%s)\n" % (
                name_link, image_link, post_link))
        finally:
            posts.close()
        count += 1
    return count

# Now we try to read posts and update pid.txt, until it doesn't work,
# and we assume we reached the last post. The next time the program is
# run, it will start with the pid that failed again.
missing = 0
while 1:
    if UNTIL_PID == pid:
        sys.stdout.write("Requested stop before pid %d\n" % pid)
        time.sleep(5)
        sys.exit(0)

    if VERBOSE:
        sys.stderr.write("Examining post %d\n" % pid)
    html = gethttp("%s?p=%s" % (topic_url, pid))

    forum_mob = _forum_re.search(html)
    name_mob = None
    if forum_mob:
        forum = forum_mob.group(1)
    in_allowed_forum = bool(forum_mob) and forum in allowed_forums

    if in_allowed_forum:
        # The author's name follows the anchor carrying this post's id.
        name_mob = re.compile(
            """<span class="name"><a name="%s"></a><b>(.*?)</b>""" % pid
            ).search(html)
        post_found = name_mob is not None
    elif forum_mob:
        # The post exists, but in a forum we are not interested in.
        if VERBOSE:
            sys.stderr.write(" (posted to %s)\n" % forum)
        post_found = True
    else:
        # assume this was the last post
        if VERBOSE:
            sys.stderr.write(" not posted (yet?)\n")
        missing += 1
        if missing > max_missing:
            # Leaving here skips the pid.txt update below, so the file
            # keeps the state written by the previous iteration.
            break
        post_found = False

    if post_found:
        # NOTE(review): max_missing is reset to 0 as well, so after the
        # first post found the loop ends at the next missing one --
        # confirm this is intended.
        max_missing = missing = 0
        last_pid = pid + 1

    if name_mob:
        # the author of the post
        name = name_mob.group(1)

        if VERBOSE:
            sys.stderr.write(" by %s\n" % name)

        posting = _post_text(html, name_mob.end(0))
        if posting is None:
            # An unexpected page layout used to raise AttributeError
            # here; skip the post instead.
            if VERBOSE:
                sys.stderr.write(" (post markers not found, skipped)\n")
        else:
            # go through all images in the post
            icount = _record_images(posting, name, pid)
            if VERBOSE:
                sys.stderr.write(" with %d images\n" % icount)

    pid += 1

    # update pid.txt
    pid_file = open("pid.txt", "w")
    try:
        pid_file.write(str(last_pid) + " " + str(missing))
    finally:
        pid_file.close()