User:PMBot/Code
Following is the preliminary source code. It reads topic page names from Proteopedia:Topic Pages and outputs the list of what would be written on a first pass. Only topic pages are read; no structures are changed.
# -*- coding: utf-8 -*-
from __future__ import with_statement  # not required in Python 2.6+

"""
pmbot [OPTIONS]

Goes through all topic pages looking for the usage of non-uploaded
structure pages (official PDBs) in scenes. Each of these PDB pages is
edited such that it contains, in the section named "About this
Structure", the string

  <!-- PMBot Start -->
  "The page TOPICPAGE refers to 1ABC."
  or
  "The pages TOPICPAGES refer to 1ABC."
  <!-- PMBot End -->

where TOPICPAGE is a topic page link, TOPICPAGES is a comma-separated
list of topic page links, and 1ABC is the name of the respective
structure page. If such a string already exists, it is updated.

Options:
At the moment, there are no options.
"""
#
# (C) R Stephan 2009
#
# Distributed under the terms of the GPL2.
#
__version__ = '0.10'

import re, sys, itertools, traceback
import wikipedia, config, catlib

wikipedia.get_throttle.setDelay(5)
# wikipedia.put_throttle.setDelay(10)

msg = {
    'en': 'pmbot: maintenance of structure references',
}

def main():
    # HTML comments (not containing '--' in their body)
    Rco = re.compile(u'<!--(?:.(?<!--))*-->')
    # target of a wiki link: the text between [[ and ]]
    Rt1 = re.compile(u'(?<=\[\[)[^\]]+(?=\]\])')
    # pipe and everything after it (used to strip link labels)
    Rt2 = re.compile(u' *\|.*')
    # PDB code in a {{STRUCTURE_1abc}} template
    Rst = re.compile(u'(?<=STRUCTURE_)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]')
    # PDB code in an <applet load='1abc' ...> tag
    Rap = re.compile(u'(?<=<applet load=.)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z](?=[\'\"])')
    # the marked span of Proteopedia:Topic_Pages that holds the topic list
    Rta = re.compile(u"Start of topic pages.*End of topic pages. Please DON'T REMOVE -->",
                     re.DOTALL)

    site = wikipedia.getSite()
    dic = {}   # maps lower-cased PDB code -> set of topic page names

    # Try to read a topic pages list.
    pagename = 'Proteopedia:Topic_Pages'
    alltopics = wikipedia.Page(site, pagename)
    try:
        temp_text = alltopics.get(False, True)   # force=False, get_redirect=True
    except wikipedia.NoPage:
        print 'NoPage exception when trying to read topic page list'
        return
    # with codecs.open('Topic_Pages.txt', encoding='utf-8') as f: temp_text = f.read()

    m = Rta.search(temp_text)
    if m is None:
        print 'Topic list markers not found.'
        return

    alltopics_text = Rco.sub(u'', m.group(0))
    topicsIter = Rt1.finditer(alltopics_text)

    c = 0
    for topicmatch in topicsIter:
        # if c > 2: break
        c = c + 1
        t = topicmatch.group(0)
        topicname = Rt2.sub(u'', t)
        # TODO: check if already loaded before
        loaded = False
        while not loaded:
            sys.stdout.flush()
            print 'Retrieving ' + topicname.encode('ascii', 'xmlcharrefreplace')
            sys.stdout.flush()
            topic = wikipedia.Page(site, topicname)
            try:
                loaded = True
                topic_text = topic.get()
            except wikipedia.NoPage:
                print 'NoPage exception when trying to read ' + \
                    topicname.encode('ascii', 'xmlcharrefreplace')
                loaded = False
                break
            except wikipedia.SectionError:
                print 'Subject does not exist: ' + \
                    topicname.encode('ascii', 'xmlcharrefreplace')
                topicname = re.sub(ur"#.*", '', topicname)  # drop the section part
                loaded = False
                continue
            except wikipedia.IsRedirectPage, inst:
                topicname = inst.args[0]
                print 'Redirected to ' + topicname.encode('ascii', 'xmlcharrefreplace')
                loaded = False
                continue
        # print topic_text.encode('utf-8')
        if not loaded:
            continue

        # Collect PDB codes from wiki links, STRUCTURE_ templates and applet tags.
        links = itertools.chain(Rt1.finditer(topic_text),
                                Rst.finditer(topic_text),
                                Rap.finditer(topic_text))
        for linkmatch in links:
            l = linkmatch.group(0)
            linkname = Rt2.sub('', l)
            # official PDB codes start with a digit 1-9
            if linkname[0] > '0' and linkname[0] <= '9':
                key = linkname.lower()
                s = dic.get(key, set())
                s.add(topicname.encode('ascii', 'xmlcharrefreplace'))
                dic[key] = s

    print 'Number of topics read: ', c
    print 'Number of structures to read/write: ', len(dic)
    sys.stdout.flush()
    print dic

if __name__ == '__main__':
    for arg in wikipedia.handleArgs():
        pass
        # - TODO: flag to switch from applet to scene backlinks to link backlinks
        # - TODO: add option to search scene files
        # - TODO: option to restrict number of topics read (c)
        # if arg.startswith("-p:"):
        #     if len(arg) == len("-p:"):
        #         pred = u"refers to"
        #     else:
        #         pred = arg[len("-p:"):]
    try:
        main()
    except:
        print 'Something wrong.'
        traceback.print_exc()
    finally:
        print 'Stop.'
        wikipedia.stopme()
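For reference, here is a minimal sketch (not part of the bot) of how the two structure-detecting patterns, Rst and Rap, pick PDB codes out of page text. The codes 1lyd and 9lyz and the sample markup are made up for illustration:

# -*- coding: utf-8 -*-
import re

Rst = re.compile(u'(?<=STRUCTURE_)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]')
Rap = re.compile(u'(?<=<applet load=.)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z](?=[\'\"])')

sample = u"Intro {{STRUCTURE_1lyd}} and a scene <applet load='9lyz' size='300' />"
print Rst.findall(sample)   # prints [u'1lyd']
print Rap.findall(sample)   # prints [u'9lyz']

The script is presumably started under the old pywikipedia framework with something like "python pmbot.py"; wikipedia.handleArgs() is expected to consume the framework's global options (such as -lang and -family) before main() runs.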