<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://wiki.wesnoth.org/index.php?action=history&amp;feed=atom&amp;title=User%3AGrabberBot%2Fgrabber.py</id>
	<title>User:GrabberBot/grabber.py - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://wiki.wesnoth.org/index.php?action=history&amp;feed=atom&amp;title=User%3AGrabberBot%2Fgrabber.py"/>
	<link rel="alternate" type="text/html" href="https://wiki.wesnoth.org/index.php?title=User:GrabberBot/grabber.py&amp;action=history"/>
	<updated>2026-04-22T09:35:44Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.31.16</generator>
	<entry>
		<id>https://wiki.wesnoth.org/index.php?title=User:GrabberBot/grabber.py&amp;diff=1998&amp;oldid=prev</id>
		<title>Allefant: pasted code</title>
		<link rel="alternate" type="text/html" href="https://wiki.wesnoth.org/index.php?title=User:GrabberBot/grabber.py&amp;diff=1998&amp;oldid=prev"/>
		<updated>2005-08-19T20:23:12Z</updated>

		<summary type="html">&lt;p&gt;pasted code&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;&amp;lt;pre&amp;gt;&lt;br /&gt;
#!/usr/bin/python&lt;br /&gt;
&lt;br /&gt;
# version 7: works on mediawiki&lt;br /&gt;
# version 6: no repeated images, no images from signatures&lt;br /&gt;
# version 5: only use allowed forums&lt;br /&gt;
# version 4: supports zlib compression&lt;br /&gt;
# version 3: also grabs inline &amp;lt;img&amp;gt;s, not just attachments&lt;br /&gt;
# version 2: can run safely from a cron job, without getting stuck at&lt;br /&gt;
#            deleted posts&lt;br /&gt;
# version 1: the first one&lt;br /&gt;
import urllib2, re, sys, os, time&lt;br /&gt;
&lt;br /&gt;
VERBOSE = 1&lt;br /&gt;
WRITE_IMG_TAGS = 0&lt;br /&gt;
UNTIL_PID = 0&lt;br /&gt;
for i in range(len(sys.argv)):&lt;br /&gt;
    if sys.argv[i] == &amp;quot;-q&amp;quot;:&lt;br /&gt;
        VERBOSE = 0&lt;br /&gt;
    elif sys.argv[i] == &amp;quot;-i&amp;quot;:&lt;br /&gt;
        WRITE_IMG_TAGS = 1&lt;br /&gt;
    elif sys.argv[i] == &amp;quot;-s&amp;quot;:&lt;br /&gt;
        UNTIL_PID = int(sys.argv[i + 1])&lt;br /&gt;
&lt;br /&gt;
forum_url = &amp;quot;http://www.wesnoth.org/forum/&amp;quot;&lt;br /&gt;
topic_url = &amp;quot;http://www.wesnoth.org/forum/viewtopic.php&amp;quot;&lt;br /&gt;
wiki_url = &amp;quot;http://wesnoth.org/wiki&amp;quot;&lt;br /&gt;
command_url = &amp;quot;http://allefant.sourceforge.net/wesnoth/graphiclibrary.py&amp;quot;&lt;br /&gt;
&lt;br /&gt;
allowed_forums = [&amp;quot;Ideas&amp;quot;, &amp;quot;Developer's Discussions&amp;quot;, &amp;quot;Art Development&amp;quot;,&lt;br /&gt;
    &amp;quot;Scenario &amp;amp; Campaign Development&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
ALREADY = {}&lt;br /&gt;
&lt;br /&gt;
# the last processed post id is kept in the file pid.txt&lt;br /&gt;
try:&lt;br /&gt;
    x = file(&amp;quot;pid.txt&amp;quot;).read().split()&lt;br /&gt;
    last_pid = pid = int(x[0])&lt;br /&gt;
    if len(x) &amp;gt; 1:&lt;br /&gt;
        max_missing = 2 + int(x[1])&lt;br /&gt;
    else:&lt;br /&gt;
        max_missing = 2&lt;br /&gt;
except IOError:&lt;br /&gt;
    print &amp;quot;&amp;quot;&amp;quot;You need to create a file called pid.txt, which has the&lt;br /&gt;
number of the first post to examine. You can find this number by&lt;br /&gt;
hovering the mouse over the tiny white page image in the header line&lt;br /&gt;
of each post. From then on, this script will maintain pid.txt itself.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
    sys.exit(1)&lt;br /&gt;
&lt;br /&gt;
def gethttp(url):&lt;br /&gt;
    http = urllib2.Request(url)&lt;br /&gt;
    http.add_header(&amp;quot;Accept-Encoding&amp;quot;, &amp;quot;gzip&amp;quot;)&lt;br /&gt;
    http.add_header(&amp;quot;User-Agent&amp;quot;, &amp;quot;grabber bot&amp;quot;)&lt;br /&gt;
    try:&lt;br /&gt;
        page = urllib2.urlopen(http)&lt;br /&gt;
    except urllib2.HTTPError:&lt;br /&gt;
        print &amp;quot;Cannot open %s\n&amp;quot; % http&lt;br /&gt;
        sys.exit(1)&lt;br /&gt;
    contents = page.read()&lt;br /&gt;
    if &amp;quot;Content-Encoding&amp;quot; in page.info() and page.info()[&amp;quot;Content-Encoding&amp;quot;] == &amp;quot;gzip&amp;quot;:&lt;br /&gt;
        stdin, stdout = os.popen2(&amp;quot;gunzip&amp;quot;)&lt;br /&gt;
        stdin.write(contents)&lt;br /&gt;
        stdin.close()&lt;br /&gt;
        contents = stdout.read()&lt;br /&gt;
        stdout.close()&lt;br /&gt;
    return contents&lt;br /&gt;
&lt;br /&gt;
# Now we try to read posts and update pid.txt until more than max_missing&lt;br /&gt;
# posts in a row cannot be found; then we assume we reached the last post.&lt;br /&gt;
# The next time the program is run, it starts again at the first pid that failed.&lt;br /&gt;
missing = 0&lt;br /&gt;
while 1:&lt;br /&gt;
    if UNTIL_PID == pid:&lt;br /&gt;
        print &amp;quot;Requested stop before pid %d&amp;quot; % pid&lt;br /&gt;
        time.sleep(5)&lt;br /&gt;
        sys.exit(0)&lt;br /&gt;
&lt;br /&gt;
    if VERBOSE:&lt;br /&gt;
        sys.stderr.write(&amp;quot;Examining post %d\n&amp;quot; % pid)&lt;br /&gt;
    html = gethttp(&amp;quot;http://www.wesnoth.org/forum/&amp;quot; +&lt;br /&gt;
        &amp;quot;viewtopic.php?p=%s&amp;quot; % pid)&lt;br /&gt;
&lt;br /&gt;
    mob = re.compile(&lt;br /&gt;
        &amp;quot;&amp;quot;&amp;quot;&amp;lt;link rel=&amp;quot;up&amp;quot; href=&amp;quot;.*?&amp;quot; title=&amp;quot;(.*?)&amp;quot; /&amp;gt;&amp;quot;&amp;quot;&amp;quot;).search(html)&lt;br /&gt;
    if mob: forum = mob.group(1)&lt;br /&gt;
    if mob and forum in allowed_forums:&lt;br /&gt;
        mob = re.compile(&amp;quot;&amp;quot;&amp;quot;&amp;lt;span class=&amp;quot;name&amp;quot;&amp;gt;&amp;quot;&amp;quot;&amp;quot; +&lt;br /&gt;
            &amp;quot;&amp;quot;&amp;quot;&amp;lt;a name=&amp;quot;%s&amp;quot;&amp;gt;&amp;lt;/a&amp;gt;&amp;lt;b&amp;gt;(.*?)&amp;lt;/b&amp;gt;&amp;quot;&amp;quot;&amp;quot; % pid).search(html)&lt;br /&gt;
    else:&lt;br /&gt;
        # assume this was the last post&lt;br /&gt;
        if VERBOSE:&lt;br /&gt;
            if mob:&lt;br /&gt;
                sys.stderr.write(&amp;quot; (posted to %s)\n&amp;quot; % forum)&lt;br /&gt;
            else:&lt;br /&gt;
                sys.stderr.write(&amp;quot; not posted (yet?)\n&amp;quot;)&lt;br /&gt;
        if not mob:&lt;br /&gt;
            missing += 1&lt;br /&gt;
            if missing &amp;gt; max_missing:&lt;br /&gt;
                break&lt;br /&gt;
            mob = None&lt;br /&gt;
&lt;br /&gt;
    if mob:&lt;br /&gt;
        max_missing = missing = 0&lt;br /&gt;
        last_pid = pid + 1&lt;br /&gt;
&lt;br /&gt;
    if mob and forum in allowed_forums:&lt;br /&gt;
        # the author of the post&lt;br /&gt;
        name = mob.group(1)&lt;br /&gt;
&lt;br /&gt;
        if VERBOSE:&lt;br /&gt;
            sys.stderr.write(&amp;quot; by %s\n&amp;quot; % name)&lt;br /&gt;
&lt;br /&gt;
        mob2 = re.compile(&amp;quot;&amp;quot;&amp;quot;&amp;lt;span class=&amp;quot;nav&amp;quot;&amp;gt;&amp;quot;&amp;quot;&amp;quot; +&lt;br /&gt;
            &amp;quot;&amp;quot;&amp;quot;&amp;lt;a href=&amp;quot;#top&amp;quot; class=&amp;quot;nav&amp;quot;&amp;gt;Back to top&amp;lt;/a&amp;gt;&amp;lt;/span&amp;gt;&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
            ).search(html, mob.end(0))&lt;br /&gt;
        posting = html[mob.end(0):mob2.start(0)]&lt;br /&gt;
        mob2 = re.compile(&amp;quot;&amp;quot;&amp;quot;Posted:&amp;quot;&amp;quot;&amp;quot;).search(posting)&lt;br /&gt;
        posting = posting[mob2.end(0):]&lt;br /&gt;
&lt;br /&gt;
        mob2 = re.compile(&amp;quot;_________________&amp;quot;).search(posting)&lt;br /&gt;
        if mob2:&lt;br /&gt;
            posting = posting[:mob2.start(0)]&lt;br /&gt;
&lt;br /&gt;
        # go through all images in the post&lt;br /&gt;
        icount = 0&lt;br /&gt;
        pos = 0&lt;br /&gt;
        while 1:&lt;br /&gt;
            mob = re.compile(&amp;quot;&amp;lt;img src=\&amp;quot;(.*?)\&amp;quot;&amp;quot;).search(posting, pos)&lt;br /&gt;
            if mob: # image&lt;br /&gt;
                image_link = mob.group(1)&lt;br /&gt;
                if not image_link.startswith(&amp;quot;templates/&amp;quot;) and (not&lt;br /&gt;
                    image_link.startswith(&amp;quot;images/&amp;quot;)):&lt;br /&gt;
                    # post.txt is the file all newly discovered image attachments are&lt;br /&gt;
                    # appended to, so it can be processed further by another script&lt;br /&gt;
                    posts = file(&amp;quot;post.txt&amp;quot;, &amp;quot;a&amp;quot;)&lt;br /&gt;
                    if image_link.startswith(&amp;quot;files/&amp;quot;):&lt;br /&gt;
                        image_link = forum_url + image_link&lt;br /&gt;
&lt;br /&gt;
                    image_link = image_link.replace(&amp;quot; &amp;quot;, &amp;quot;%20&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
                    if not image_link in ALREADY:&lt;br /&gt;
                        ALREADY[image_link] = 1&lt;br /&gt;
&lt;br /&gt;
                        post_link = &amp;quot;&amp;quot;&amp;quot;[%s?p=%d#%d post %d]&amp;quot;&amp;quot;&amp;quot; % (&lt;br /&gt;
                            topic_url, pid, pid, pid)&lt;br /&gt;
                        name_link = &amp;quot;&amp;quot;&amp;quot;[%s/%sContrib %s] | &amp;quot;&amp;quot;&amp;quot; % (&lt;br /&gt;
                            wiki_url, name.replace(&amp;quot; &amp;quot;, &amp;quot;%20&amp;quot;), name)&lt;br /&gt;
                        name_link += &amp;quot;&amp;quot;&amp;quot;[%s?command=move%%20%s%%20%s Move] | &amp;quot;&amp;quot;&amp;quot; % (&lt;br /&gt;
                            command_url, name.replace(&amp;quot; &amp;quot;, &amp;quot;%20&amp;quot;), image_link)&lt;br /&gt;
                        name_link += &amp;quot;&amp;quot;&amp;quot;[%s?command=delete%%20%s X]&amp;quot;&amp;quot;&amp;quot; % (&lt;br /&gt;
                            command_url, image_link)&lt;br /&gt;
&lt;br /&gt;
                        if WRITE_IMG_TAGS:&lt;br /&gt;
                            image_link = &amp;quot;&amp;quot;&amp;quot;&amp;lt;img src=&amp;quot;%s&amp;quot; /&amp;gt;&amp;quot;&amp;quot;&amp;quot; % image_link&lt;br /&gt;
&lt;br /&gt;
                        posts.write(&amp;quot;* %s | %s | (%s)\n&amp;quot; % (name_link, image_link, post_link))&lt;br /&gt;
                        posts.close()&lt;br /&gt;
                        icount += 1&lt;br /&gt;
                pos = mob.end(0)&lt;br /&gt;
            else: # no more images&lt;br /&gt;
                break&lt;br /&gt;
&lt;br /&gt;
        if VERBOSE:&lt;br /&gt;
            sys.stderr.write(&amp;quot; with %d images\n&amp;quot; % icount)&lt;br /&gt;
&lt;br /&gt;
    pid += 1&lt;br /&gt;
&lt;br /&gt;
    # update pid.txt&lt;br /&gt;
    file(&amp;quot;pid.txt&amp;quot;, &amp;quot;w&amp;quot;).write(str(last_pid) + &amp;quot; &amp;quot; + str(missing))&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;/div&gt;</summary>
		<author><name>Allefant</name></author>
		
	</entry>
</feed>