User:GrabberBot/grabber.py
From The Battle for Wesnoth Wiki
#!/usr/bin/python
# version 7: works on mediawiki
# version 6: no repeated images, no images from signatures
# version 5: only use allowed forums
# version 4: supports zlib compression
# version 3: grabs also inline <img>s, not just attachments
# version 2: can run safely from a cron job, without getting stuck at
#            deleted posts
# version 1: the first one
"""Walk forum posts by increasing post id and log the images they contain.

For every post in one of the allowed forums, each image found in the post
body is appended as one wiki-formatted line to post.txt.  The id of the
next post to examine is kept in pid.txt between runs.

Command line options:
    -q        quiet, do not write progress messages to stderr
    -i        write <img> tags instead of plain image URLs
    -s PID    stop (and exit) just before examining post PID
"""

import urllib2
import re
import sys
import os
import time
import zlib

VERBOSE = 1
WRITE_IMG_TAGS = 0
UNTIL_PID = 0

for i, arg in enumerate(sys.argv):
    if arg == "-q":
        VERBOSE = 0
    elif arg == "-i":
        WRITE_IMG_TAGS = 1
    elif arg == "-s":
        # A missing or non-numeric value used to end in a bare traceback.
        try:
            UNTIL_PID = int(sys.argv[i + 1])
        except (IndexError, ValueError):
            sys.stderr.write("Option -s needs a numeric post id\n")
            sys.exit(1)

forum_url = "http://www.wesnoth.org/forum/"
topic_url = "http://www.wesnoth.org/forum/viewtopic.php"
wiki_url = "http://wesnoth.org/wiki"
command_url = "http://allefant.sourceforge.net/wesnoth/graphiclibrary.py"

allowed_forums = ["Ideas", "Developer's Discussions", "Art Development",
    "Scenario & Campaign Development"]

# Image URLs already written during this run (used as a set).
ALREADY = {}

# Patterns that do not depend on the post id are compiled once, up front.
# \s+ between href and title tolerates either a space or a line break there.
FORUM_RE = re.compile(r"""<link rel="up" href=".*?"\s+title="(.*?)" />""")
BACK_TO_TOP_RE = re.compile("""<span class="nav">""" +
    """<a href="#top" class="nav">Back to top</a></span>""")
POSTED_RE = re.compile("""Posted:""")
SIGNATURE_RE = re.compile("_________________")
IMG_RE = re.compile("<img src=\"(.*?)\"")

# the last processed post id is kept in the file pid.txt
# Format: "<pid> [<missing>]"; the optional second number widens the
# tolerance for consecutive missing posts on this run.
try:
    pid_file = open("pid.txt")
    try:
        x = pid_file.read().split()
    finally:
        pid_file.close()
    last_pid = pid = int(x[0])
    if len(x) > 1:
        max_missing = 2 + int(x[1])
    else:
        max_missing = 2
except (IOError, IndexError, ValueError):
    # An empty or malformed pid.txt gets the same help as a missing one.
    print("""You need to create a file called pid.txt, which has the number
of the first post to examine. You can find this number by hovering the
mouse over the tiny white page image in the header line of each post.
From then on, this script will maintain pid.txt itself.""")
    sys.exit(1)


def gethttp(url):
    """Fetch url and return the response body as a string.

    The request advertises gzip support; a gzip-encoded response is
    decompressed before it is returned.  If the URL cannot be opened, a
    message is printed and the whole process exits with status 1.
    """
    http = urllib2.Request(url)
    http.add_header("Accept-Encoding", "gzip")
    http.add_header("User-Agent", "grabber bot")
    try:
        page = urllib2.urlopen(http)
    except urllib2.URLError:
        # URLError also covers HTTPError (its subclass) and plain
        # connection failures.
        print("Cannot open %s\n" % url)
        sys.exit(1)
    try:
        contents = page.read()
        headers = page.info()
    finally:
        page.close()
    if "Content-Encoding" in headers and headers["Content-Encoding"] == "gzip":
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        # Decompressing in-process avoids the external gunzip pipe, which
        # could block forever once its output filled the pipe buffer.
        contents = zlib.decompress(contents, 16 + zlib.MAX_WBITS)
    return contents


# Now we try to read posts and update pid.txt, until it doesn't work,
# and we assume we reached the last post. The next time the program is
# run, it will start with the pid that failed again.
missing = 0
while 1:
    if UNTIL_PID == pid:
        print("Requested stop before pid %d" % pid)
        time.sleep(5)
        sys.exit(0)

    if VERBOSE:
        sys.stderr.write("Examining post %d\n" % pid)

    html = gethttp(topic_url + "?p=%s" % pid)

    # The "up" link of the page carries the name of the containing forum.
    mob = FORUM_RE.search(html)
    if mob:
        forum = mob.group(1)
    if mob and forum in allowed_forums:
        # Locate the author name next to the anchor of this very post.
        mob = re.compile("""<span class="name">""" +
            """<a name="%s"></a><b>(.*?)</b>""" % pid).search(html)
    else:
        # assume this was the last post
        if VERBOSE:
            if mob:
                sys.stderr.write(" (posted to %s)\n" % forum)
            else:
                sys.stderr.write(" not posted (yet?)\n")
        if not mob:
            missing += 1
            if missing > max_missing:
                break
        mob = None

    # From here on mob is only set for a post in an allowed forum whose
    # author could be located.
    if mob:
        max_missing = missing = 0
        last_pid = pid + 1

        # the author of the post
        name = mob.group(1)
        if VERBOSE:
            sys.stderr.write(" by %s\n" % name)

        # Cut the post body out of the page: it starts after the
        # "Posted:" label and ends at the "Back to top" link or, if
        # present, at the signature separator.
        posting = ""
        posted_mob = None
        end_mob = BACK_TO_TOP_RE.search(html, mob.end(0))
        if end_mob:
            posting = html[mob.end(0):end_mob.start(0)]
            posted_mob = POSTED_RE.search(posting)
        if posted_mob:
            posting = posting[posted_mob.end(0):]
            sig_mob = SIGNATURE_RE.search(posting)
            if sig_mob:
                posting = posting[:sig_mob.start(0)]
        else:
            # Unexpected page layout: skip the body rather than crash (a
            # crash would repeat on every run) or scan other posts' images.
            posting = ""
            if VERBOSE:
                sys.stderr.write(" could not locate the post body\n")

        # go through all images in the post
        icount = 0
        for img_mob in IMG_RE.finditer(posting):
            image_link = img_mob.group(1)
            # Skip forum template and stock images.
            if (image_link.startswith("templates/") or
                    image_link.startswith("images/")):
                continue
            if image_link.startswith("files/"):
                image_link = forum_url + image_link
            image_link = image_link.replace(" ", "%20")
            if image_link in ALREADY:
                continue
            ALREADY[image_link] = 1

            post_link = """[%s?p=%d#%d post %d]""" % (
                topic_url, pid, pid, pid)
            name_link = """[%s/%sContrib %s] | """ % (
                wiki_url, name.replace(" ", "%20"), name)
            name_link += """[%s?command=move%%20%s%%20%s Move] | """ % (
                command_url, name.replace(" ", "%20"), image_link)
            name_link += """[%s?command=delete%%20%s X]""" % (
                command_url, image_link)
            if WRITE_IMG_TAGS:
                image_link = """<img src="%s" />""" % image_link

            # post.txt is where all newly discovered image attachments are
            # appended to, so it can be processed further by another script
            posts = open("post.txt", "a")
            try:
                posts.write("* %s | %s | (%s)\n" % (name_link, image_link,
                    post_link))
            finally:
                posts.close()
            icount += 1

        if VERBOSE:
            sys.stderr.write(" with %d images\n" % icount)

    pid += 1

    # update pid.txt
    # NOTE(review): the original indentation was lost; this write is placed
    # inside the loop so progress survives an exit from gethttp() -- confirm.
    pid_file = open("pid.txt", "w")
    try:
        pid_file.write(str(last_pid) + " " + str(missing))
    finally:
        pid_file.close()
This page was last edited on 19 August 2005, at 20:23.