User:PMBot/Code

Following is the preliminary source code. It reads topic page names from Proteopedia:Topic Pages and outputs the list of what the first pass would write. Only topic pages are read; no structure pages are changed.

# -*- coding: utf-8 -*-
"""
pmbot [OPTIONS]
Goes through all topic pages looking for the usage of non-uploaded
structure pages (official PDBs) in scenes. Each of these PDB pages
is edited such that it contains in the section named "About this
Structure" the string

<!-- PMBot Start -->
"The page TOPICPAGE refers to 1ABC."
or "The pages TOPICPAGES refer to 1ABC."
<!-- PMBot End -->

where TOPICPAGE is a topic page link, TOPICPAGES is a comma-separated
list of topic page links, and 1ABC is the name of the respective structure 
page. If such a string already exists, it is updated in place.

Options:
At the moment, there are no options.

"""
#
# (C) R Stephan 2009
#
# Distributed under the terms of the GPL2.
# 
__version__ = '0.10'
#

import wikipedia,re,sys,config
import catlib,traceback,itertools

wikipedia.get_throttle.setDelay(5)
#wikipedia.put_throttle.setDelay(10)

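# Edit summary, keyed by language code, for the structure-page edits.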
msg={
    'en': 'pmbot: maintenance of structure references',
    }

def main():
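  # Precompiled patterns: Rco matches HTML comments (stripped from the topic
  # list), Rt1 grabs [[wikilink]] targets, Rt2 drops "|label" tails, Rst and
  # Rap pull four-character PDB codes out of STRUCTURE_ templates and
  # <applet load=...> tags, and Rta isolates the marker-delimited topic list.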
  Rco = re.compile (u'<!--(?:.(?<!--))*-->')
  Rt1 = re.compile (u'(?<=\[\[)[^\]]+(?=\]\])')
  Rt2 = re.compile (u' *\|.*')
  Rst = re.compile (u'(?<=STRUCTURE_)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]')
  Rap = re.compile (u'(?<=<applet load=.)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z](?=[\'\"])')
  Rta = re.compile (u"Start of topic pages.*End of topic pages. Please DON'T REMOVE -->", re.DOTALL)
  site = wikipedia.getSite()
  dic = {}

# Try to read a topic pages list.
  pagename = 'Proteopedia:Topic_Pages'
  alltopics = wikipedia.Page (site, pagename)
  try:
    temp_text = alltopics.get (False, True) # force=False, get_redirect=True
  except wikipedia.NoPage:
    print 'NoPage exception when trying to read topic page list'
    return
#  with codecs.open('Topic_Pages.txt', encoding='utf-8') as f: temp_text = f.read()

  m = Rta.search(temp_text)
  if m is None:
    print 'Topic list markers not found.'
    return
  alltopics_text = Rco.sub (u'', m.group(0))
  topicsIter = Rt1.finditer (alltopics_text)

  c = 0
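  # Walk every topic page and record the structure pages it references.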
  for topicmatch in topicsIter:
#    if c>2: break
    c = c+1
    t = topicmatch.group(0)
    topicname = Rt2.sub (u'', t)
    
    # TODO: check if already loaded before
    loaded = False
    # Retry until the page is read: redirects and section errors adjust the
    # name and loop again; a missing page skips this topic.
    while not loaded:
      sys.stdout.flush()
      print 'Retrieving ' + topicname.encode ('ascii', 'xmlcharrefreplace')
      sys.stdout.flush()
      topic = wikipedia.Page (site, topicname)
      try:
        loaded = True
        topic_text = topic.get()
      except wikipedia.NoPage:
        print 'NoPage exception when trying to read ' + topicname.encode ('ascii', 'xmlcharrefreplace')
        loaded = False
        break
      except wikipedia.SectionError:
        print 'Subject does not exist: ' + topicname.encode ('ascii', 'xmlcharrefreplace')
        topicname = re.sub (ur"#.*", '', topicname)
        loaded = False
        continue
      except wikipedia.IsRedirectPage, inst:
        topicname = inst.args[0]
        print 'Redirected to ' + topicname.encode ('ascii', 'xmlcharrefreplace')
        loaded = False
        continue
#    print topic_text.encode('utf-8')
    
    if not loaded: continue
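    # Candidate references come from plain wikilinks, STRUCTURE_ templates,
    # and applet tags.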
    links = itertools.chain (
        Rt1.finditer (topic_text),
        Rst.finditer (topic_text),
        Rap.finditer (topic_text))
    for linkmatch in links:
      l = linkmatch.group(0)
      linkname = Rt2.sub ('', l)
      # PDB codes begin with a digit 1-9, unlike topic page titles.
      if linkname and u'1' <= linkname[0] <= u'9':
        key = linkname.lower()
        s = dic.get (key, set())
        s.add (topicname.encode('ascii', 'xmlcharrefreplace'))
        dic[key] = s
  print 'Number of topics read: ', c
  print 'Number of structures to read/write: ', len(dic)
  sys.stdout.flush()
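  # The first pass is read-only: dump the mapping instead of editing pages.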
  print dic

if __name__ == '__main__':
  for arg in wikipedia.handleArgs():
# - TODO: flag to switch from applet to scene backlinks to link backlinks
# - TODO: add option to search scene files
# - TODO: option to restrict number of topics read (c)
#    if arg.startswith("-p:"):
#      if (len(arg)) == len("-p:"):
#        pred = u"refers to"
#      else:
#        pred = arg[len("-p:"):]
    pass # the loop needs a body; no options are implemented yet

  try:
    main()
  except:
    print 'Something went wrong.'
    traceback.print_exc()
  finally:
    print 'Stop.'
    wikipedia.stopme()
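
Run as a normal pywikipedia bot script, the first pass takes no options and only prints; a session might look roughly like the following, where the topic names and counts are purely illustrative:

 $ python pmbot.py
 Retrieving Hemoglobin
 Retrieving Lac repressor
 Number of topics read:  2
 Number of structures to read/write:  2
 {u'1hho': set(['Hemoglobin']), u'1lbi': set(['Lac repressor'])}
 Stop.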