Revision as of 17:41, 12 February 2008
Source of the User:DumZiBoT/refLinks task.
Please edit this page if you think that my code needs improvements: the fact that I released this code on a wiki page is not meaningless.
It would be nice to poke me if you plan to reuse my code, but again, it's up to you.
# -*- coding: utf-8 -*-
"""
This bot searches for references which consist only of a bare link
without a title (i.e. <ref>[http://www.google.fr/]</ref> or
<ref>http://www.google.fr/</ref>) and fetches the HTML title from
the link to use it as the title of the wiki link in the reference, e.g.
<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
Every 20 edits, the bot checks its talk page and a special stop page: if
either page has been edited, it stops.
&params;
-limit:n Stops after n edits
-xml:dump.xml Should be used instead of a simple page fetching
method from pagegenerators.py for performance and
load issues
-xmlstart Page to start with when using an XML dump
Basic pagegenerators commands, -page, etc...
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
#
# Distributed under the terms of the GPL
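# A worked example of the transformation (illustrative, not from a real run):
#   before: <ref name="x">http://www.example.com/</ref>
#   after:  <ref name="x">[http://www.example.com/ Example Domain<!-- Bot generated title -->]</ref>
# The HTML <title> of the linked page becomes the label of the external link,
# and a hidden comment marks the title as bot-generated (see the 'comment'
# messages below).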
from BeautifulSoup import UnicodeDammit
import sys, re, urllib2, httplib, socket, codecs, ftplib
import wikipedia, pagegenerators
import subprocess, tempfile, os
msg = { 'fr':u'Bot: Correction des refs. mal formatées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
'en':u'Bot: Converting bare references, see [[User:DumZiBoT/refLinks|FAQ]]'}
lang_template = { 'fr':u'{{%s}}',
'en':u'{{%s icon}}'}
deadLinkTag = {'fr':u'',
'de':u'',
'en':u'{{dead link}}'}
comment = {'fr':u'Titre généré automatiquement',
           'de':u'Automatisch generierter Titel',
'en':u'Bot generated title'}
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
'en':u'User:DumZiBoT/EditThisPageToStopMe'}
soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
dirIndex = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
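# Illustrative behaviour of the three heuristics above (my own examples):
#   soft404 flags URLs that smell like an error page,
#     e.g. u'http://www.example.com/errdoc/404.html'
#   dirIndex flags links to a site root or a directory index,
#     e.g. u'http://www.example.com/' or u'http://www.example.com/index.html'
#   domain extracts (scheme, host) so that a redirect can be compared with its
#     source: domain.findall(u'http://www.example.com/page') returns
#     [(u'http', u'example.com')]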
badtitles = {'en':
# starts with
ur'(?is)^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
# anywhere
+'|(404|page|file).*not *found'
# ends with
+'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'}
linksInRef = re.compile(
# bracketed URLs
ur'<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
# unbracketed with()
    ur'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+
# unbracketed without ()
ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
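# linksInRef captures an optional ref name and the bare URL. For instance
# (illustrative), both of the following match, with group('url') holding the
# plain URL and trailing '!', '?' or ',' left out of it:
#   <ref>[http://www.example.com/page]</ref>
#   <ref name="foo">http://www.example.com/wiki_(disambiguation)</ref>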
#'http://www.twoevils.org/files/wikipedia/404-links.txt.gz'
listof404pages = '404-links.txt'
# References sections are usually placed before further reading / external
# link sections. This dictionary defines these sections, sorted by priority.
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to put it in front of the "External links" section,
# or if that fails, the "See also" section, etc.
placeBeforeSections = {
'de': [ # no explicit policy on where to put the references
u'Literatur',
u'Weblinks',
u'Siehe auch',
u'Weblink', # bad, but common singular form of Weblinks
],
'en': [ # no explicit policy on where to put the references
u'Further reading',
u'External links',
u'See also',
u'Notes'
]
}
# Titles of sections where a reference tag would fit into.
# The first title should be the preferred one: It's the one that
# will be used when a new section has to be created.
referencesSections = {
'de': [
u'Einzelnachweise', # The "Einzelnachweise" title is disputed, some people prefer the other variants
u'Quellen',
u'Quellenangaben',
u'Fußnoten',
],
'en': [ # not sure about which ones are preferred.
u'References',
u'Footnotes',
u'Notes',
]
}
referencesTemplates = {
'wikipedia': {
'en': [u'Reflist',u'Refs',u'FootnotesSmall',u'Reference',
u'Ref-list',u'Reference list',u'References-small',u'Reflink',
u'Footnotes',u'FootnotesSmall'],
},
}
class XmlDumpPageGenerator:
def __init__(self, xmlFilename, xmlStart, namespaces):
self.xmlFilename = xmlFilename
self.xmlStart = xmlStart
self.namespaces = namespaces
self.skipping = bool(xmlStart)
self.site = wikipedia.getSite()
import xmlreader
dump = xmlreader.XmlDump(self.xmlFilename)
self.parser = dump.parse()
def __iter__(self):
return self
def next(self):
while True:
try:
entry = self.parser.next()
except StopIteration:
raise
if self.skipping:
if entry.title != self.xmlStart:
continue
self.skipping = False
            page = wikipedia.Page(self.site, entry.title)
            if self.namespaces != []:
if page.namespace() not in self.namespaces:
continue
if linksInRef.search(entry.text):
return page
class RefLink:
def __init__(self, link, name):
self.refname = name
self.link = link
self.site = wikipedia.getSite()
self.linkComment = wikipedia.translate(self.site, comment)
self.url = re.sub(u'#.*', '', self.link)
self.title = None
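        # self.lang is assigned later, in ReferencesRobot.run(), from the HTTP
        # Content-Language header or a <meta> tag; initialising it here keeps
        # refTitle()/refLink() safe to call even if that assignment is skipped.
        self.lang = None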
def refTitle(self):
if self.lang and self.lang != self.site.language():
tl = wikipedia.translate(self.site, lang_template) % self.lang
tl = tl + ' '
else:
tl =''
return '<ref%s>%s[%s %s<!-- %s -->]</ref>' % (self.refname, tl, self.link, self.title, self.linkComment)
def refLink(self):
if self.lang and self.lang != self.site.language():
tl = wikipedia.translate(self.site, lang_template) % self.lang
tl = tl + ' '
else:
tl =''
return '<ref%s>%s%s</ref>' % (self.refname, tl, self.link)
def refDead(self):
tag = wikipedia.translate(self.site, deadLinkTag)
return '<ref%s>[%s]%s</ref>' % (self.refname, self.link, tag)
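    # Sample outputs of the three builders above (illustrative), assuming an
    # 'en' wiki, refname == u' name="x"' and self.lang == 'fr':
    #   refTitle(): <ref name="x">{{fr icon}} [http://u Title<!-- Bot generated title -->]</ref>
    #   refLink():  <ref name="x">{{fr icon}} http://u</ref>
    #   refDead():  <ref name="x">[http://u]{{dead link}}</ref>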
def transform(self):
self.avoid_uppercase()
#avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
#avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
#prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
self.title = wikipedia.unicode2html(self.title, self.site.encoding())
def avoid_uppercase(self):
"""
        If the title has more than 6 characters, contains no digit, and more
        than 70% of its letters are uppercase, title-case it
"""
if len(self.title) <= 6:
return
nb_upper = 0
nb_letter = 0
for letter in self.title:
if letter.isupper():
nb_upper += 1
if letter.isalpha():
nb_letter += 1
if letter.isdigit():
return
if float(nb_upper)/(nb_letter+1) > .70:
self.title = self.title.title()
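        # e.g. u'EXAMPLE DOMAIN - HOME' becomes u'Example Domain - Home';
        # titles containing a digit (often acronyms or product codes) are
        # left untouched by the early return above.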
class ReferencesRobot:
def __init__(self, generator, acceptall = False, limit = None):
self.generator = generator
self.acceptall = acceptall
self.limit = limit
self.site = wikipedia.getSite()
self.stopPage = wikipedia.translate(self.site, stopPage)
self.stopPageRevId = wikipedia.Page(self.site,
self.stopPage).latestRevision()
self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
self.META_LANG = re.compile(ur'(?i)<meta[^>]*content\-language[^>]*>')
self.LANG = re.compile(ur'(?i)content\s*=\s*(?P<lang>[^\'";>/]*)')
self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
        mime = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
        self.MIME = re.compile(mime)
self.titleBlackList = re.compile(wikipedia.translate(self.site, badtitles))
self.referencesR = re.compile('<references */>', re.IGNORECASE)
try:
self.templateR = '{{(' + '|'.join(referencesTemplates[self.site.family.name][self.site.lang]) + ')'
except KeyError:
self.templateR =''
def put_page(self, page, new):
"""
        Prints diffs between original and new text, then puts the new text for the page
"""
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
wikipedia.showDiff(page.get(), new)
if not self.acceptall:
choice = wikipedia.inputChoice(u'Do you want to accept ' +
u'these changes?',
['Yes', 'No', 'All'],
['y', 'N', 'a'], 'N')
if choice in ['a', 'A']:
self.acceptall = True
if choice in ['y', 'Y']:
page.put_async(new)
if self.acceptall:
try:
page.put(new)
except wikipedia.EditConflict:
wikipedia.output(u'Skipping %s because of edit conflict'
% (page.title(),))
except wikipedia.SpamfilterError, e:
wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
except wikipedia.PageNotSaved, error:
wikipedia.output(u'Error putting page: %s' % (error.args,))
except wikipedia.LockedPage:
wikipedia.output(u'Skipping %s (locked page)'
% (page.title(),))
except wikipedia.ServerError, e:
wikipedia.output(u'Server Error : %s' % e)
def httpError(self, err_num, link, pagetitleaslink):
"""Log HTTP Error"""
wikipedia.output(u'HTTP error (%s) for %s on %s'
% (err_num, link, pagetitleaslink),
toStdout = True)
f = codecs.open(
wikipedia.config.datafilepath(
'reflinks-httpErrorLog',
'reflinks-%s-%s.txt' % (self.site.family.name,
self.site.lang)),
'a', 'utf-8')
f.write(u'%s: %s from %s\n' % (err_num, link, pagetitleaslink))
f.close()
def addReferences(self, text):
"""
Add <references/> when missing
"""
if self.referencesR.search(text):
return text
elif self.templateR and re.search(self.templateR, text, re.I):
return text
for section in wikipedia.translate(self.site, referencesSections):
sectionR = re.compile(r'\r\n=+ *%s *=+\r\n' % section)
index = 0
while index < len(text):
match = sectionR.search(text, index)
if match:
if wikipedia.isDisabled(text, match.start()):
wikipedia.output('Existing %s section is commented out, skipping.' % section)
index = match.end()
else:
wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
return text[:match.end()] + u'\n<references/>\n' + text[match.end():]
else:
break
# Create a new section for the references tag
for section in wikipedia.translate(self.site, placeBeforeSections):
# Find out where to place the new section
sectionR = re.compile(r'\r\n=+ *%s *=+\r\n' % section)
index = 0
while index < len(text):
match = sectionR.search(text, index)
if match:
if wikipedia.isDisabled(text, match.start()):
wikipedia.output('Existing %s section is commented out, won\'t add the references in front of it.' % section)
index = match.end()
else:
wikipedia.output(u'Adding references section before %s section...\n' % section)
index = match.start()
return self.createReferenceSection(text, index)
else:
break
# This gets complicated: we want to place the new references
# section over the interwiki links and categories, but also
# over all navigation bars, persondata, and other templates
# that are at the bottom of the page. So we need some advanced
# regex magic.
# The strategy is: create a temporary copy of the text. From that,
# keep removing interwiki links, templates etc. from the bottom.
# At the end, look at the length of the temp text. That's the position
# where we'll insert the references section.
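        # For instance (illustrative), given a page ending with
        # u'...text\r\n[[Category:Foo]]\r\n[[de:Foo]]\r\n{{DEFAULTSORT:Foo}}'
        # the loop below strips the template, then the interwiki link, then
        # the category, leaving the end of u'...text' as the insertion point.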
catNamespaces = '|'.join(self.site.category_namespaces())
categoryPattern = r'\[\[\s*(%s)\s*:[^\n]*\]\]\s*' % catNamespaces
interwikiPattern = r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]\s*'
# won't work with nested templates
        templatePattern = r'{{((?!}}).)+?}}\s*' # the negative lookahead ensures that we'll match the last template occurrence in the temp text.
commentPattern = r'<!--((?!-->).)*?-->\s*'
metadataR = re.compile(r'(\r\n)?(%s|%s|%s|%s)$' % (categoryPattern, interwikiPattern, templatePattern, commentPattern), re.DOTALL)
tmpText = text
while True:
match = metadataR.search(tmpText)
if match:
tmpText = tmpText[:match.start()]
else:
break
        wikipedia.output(u'Found no section that can be preceded by a new references section. Placing it before interwiki links, categories, and bottom templates.')
index = len(tmpText)
return self.createReferenceSection(text, index)
def createReferenceSection(self, text, index):
newSection = u'\n== %s ==\n\n<references/>\n' % wikipedia.translate(self.site, referencesSections)[0]
return text[:index] + newSection + text[index:]
def getPDFTitle(self, ref, f):
wikipedia.output( u'PDF file.' )
fd, infile = tempfile.mkstemp()
        urlobj = os.fdopen(fd, 'w+')
urlobj.write(f.read())
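        # pdfinfo is pointed at "/dev/stdin"; on Linux that path re-opens the
        # temporary file backing the inherited descriptor from offset 0, so the
        # whole download is scanned even though we just wrote to its end.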
try:
pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, shell=False).communicate()[0]
for aline in pdfinfo_out.splitlines():
if aline.lower().startswith('title'):
ref.title = aline.split(None)[1:]
ref.title = ' '.join(ref.title)
if ref.title != '': wikipedia.output(u'title: ' +ref.title )
wikipedia.output( u'PDF done.' )
except ValueError:
wikipedia.output( u'pdfinfo value error.' )
except OSError:
wikipedia.output( u'pdfinfo OS error.' )
except: # Ignore errors
wikipedia.output( u'PDF processing error.' )
pass
finally:
urlobj.close()
os.unlink(infile)
def run(self):
"""
Runs the Bot
"""
wikipedia.setAction(wikipedia.translate(self.site, msg))
deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
socket.setdefaulttimeout(30)
editedpages = 0
for page in self.generator:
try:
# Load the page's text from the wiki
new_text = page.get()
if not page.canBeEdited():
wikipedia.output(u"You can't edit page %s"
% page.aslink())
continue
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.aslink())
continue
except wikipedia.IsRedirectPage:
wikipedia.output(u'Page %s is a redirect' % page.aslink())
continue
for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())):
#for each link to change
link = match.group(u'url')
#debugging purpose
#print link
if u'www.jstor.org' in link:
#TODO: Clean URL blacklist
continue
ref = RefLink(link, match.group('name'))
f = None
try:
socket.setdefaulttimeout(20)
f = urllib2.urlopen(ref.url)
#Try to get Content-Type from server
headers = f.info()
contentType = headers.getheader('Content-Type')
#get the content language
ref.lang = headers.getheader('Content-Language')
if ref.lang:
ref.lang=ref.lang[:2].lower()
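                        # e.g. a header of u'fr-FR' or u'en-us' is reduced to
                        # the bare code ('fr', 'en') used by {{xx icon}}.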
if contentType and not self.MIME.search(contentType):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
if ref.link.lower().endswith('.pdf'):
# If file has a PDF suffix
self.getPDFTitle(ref, f)
if ref.title:
ref.transform()
repl = ref.refTitle()
else:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
# Test if the redirect was valid
redir = f.geturl()
if redir != ref.link and domain.findall(redir) == domain.findall(link):
if soft404.search(redir) and not soft404.search(ref.link):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link)
continue
if dirIndex.match(redir) and not dirIndex.match(ref.link):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
continue
                    # Read the first 1,000,000 bytes (~0.95 MiB)
linkedpagetext = f.read(1000000)
socket.setdefaulttimeout(None)
except UnicodeError:
#example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]]
wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink()))
continue
except urllib2.HTTPError, e:
self.httpError(e.code, ref.url, page.aslink())
if e.code == 410: # 410 Gone, indicates that the resource has been purposely removed
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
elif e.code == 404 and (u'\t%s\t' % ref.url in deadLinks):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
except (urllib2.URLError,
socket.error,
IOError,
httplib.error), e:
#except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
wikipedia.output(u'Can\'t get page %s : %s' % (ref.url, e))
continue
except ValueError:
#Known bug of httplib, google for :
#"httplib raises ValueError reading chunked content"
continue
finally:
if f:
f.close()
#remove <script>/<style>/comments/CDATA tags
linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
if not ref.lang:
meta_lang = self.META_LANG.search(linkedpagetext)
if meta_lang:
tag = meta_lang.group()
m = self.LANG.search(tag)
if m:
tmp = m.group('lang').strip("\"' ").lower()
ref.lang = tmp[:2]
meta_content = self.META_CONTENT.search(linkedpagetext)
enc = []
if meta_content:
tag = meta_content.group()
if not contentType:
contentType = tag
s = self.CHARSET.search(tag)
if s:
tmp = s.group('enc').strip("\"' ").lower()
enc.append(tmp)
if tmp in ("gb 2312", "gb2312", "gb-2312", "gb_2312"):
enc.append("gbk")
if not ref.lang:
ref.lang="zh"
if tmp in ("shift jis", "shiftjis", "shift-jis", "shift_jis"):
enc.append("shift jis 2004")
enc.append("cp932")
if not ref.lang:
ref.lang="ja"
if tmp in ("x euc jp", "x-euc-jp"):
enc.append("euc-jp")
if not ref.lang:
ref.lang="ja"
if not contentType:
wikipedia.output(u'No content-type found for %s' % ref.link)
continue
elif not self.MIME.search(contentType):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
if u'.ru' in ref.link or ref.lang == "ru":
# see http://www.sci.aha.ru/ATL/ra13a.htm : no server encoding, no page encoding
enc.append(u'windows-1251')
print(enc)
u = UnicodeDammit(linkedpagetext, overrideEncodings = enc)
print(u.triedEncodings)
if not u.unicode:
                    #Some pages have utf-8 AND windows-1252 characters,
                    #which can't easily be parsed. (~1 in 1000)
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
wikipedia.output('%s : Hybrid encoding...' % ref.link)
continue
for m in self.TITLE.finditer(u.unicode):
t = m.group()
if t:
#convert html entities
t = wikipedia.html2unicode(t)
t = re.sub(r'-+', '-', t)
#remove formatting, i.e long useless strings
t = re.sub(r'[\.+\-=]{4,}', ' ', t)
#remove \n and \r and Unicode spaces from titles
t = re.sub(r'(?u)\s', ' ', t)
t = re.sub(r'[\n\r\t]', ' ', t)
#remove extra whitespaces
#remove leading and trailing ./;/,/-/_/+/ /
t = re.sub(r' +', ' ', t.strip(r'=.;,-+_ '))
if t:
ref.title = t
                            break
if not ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
wikipedia.output(u'%s : No title found...' % ref.link)
continue
if enc and u.originalEncoding not in enc:
wikipedia.output(u'\03{lightpurple}ENCODING\03{default} : %s (%s)' % (ref.link, ref.title))
if u'é' in ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
wikipedia.output(u'%s : Hybrid encoding...' % ref.link)
continue
if self.titleBlackList.search(ref.title):
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))
continue
ref.transform()
repl = ref.refTitle()
new_text = new_text.replace(match.group(), repl)
if new_text == page.get():
wikipedia.output('No changes were necessary in %s'
% page.aslink())
continue
new_text = self.addReferences(new_text)
editedpages += 1
self.put_page(page, new_text)
if self.limit and editedpages >= self.limit:
wikipedia.output('Edited %s pages, stopping.' % self.limit)
return
if editedpages % 20 == 0:
wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
actualRev = wikipedia.Page(self.site,
self.stopPage).latestRevision()
if actualRev != self.stopPageRevId:
wikipedia.output(u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage)
return
if self.site.messages:
wikipedia.output(u'Bot has new messages. Better stop to check.')
return
def main():
genFactory = pagegenerators.GeneratorFactory()
PageTitles = []
xmlFilename = None
always = False
limit = None
namespaces = []
generator = None
for arg in wikipedia.handleArgs():
if arg.startswith('-page:'):
PageTitles.append(arg[6:])
elif arg.startswith('-namespace:'):
try:
namespaces.append(int(arg[11:]))
except ValueError:
namespaces.append(arg[11:])
elif arg.startswith('-summary:'):
wikipedia.setAction(arg[9:])
elif arg == '-always':
always = True
elif arg.startswith('-limit:'):
limit = int(arg[7:])
elif arg.startswith('-xmlstart'):
if len(arg) == 9:
xmlStart = wikipedia.input(
u'Please enter the dumped article to start with:')
else:
xmlStart = arg[10:]
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(
u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
else:
generator = genFactory.handleArg(arg)
if xmlFilename:
try:
xmlStart
except NameError:
xmlStart = None
generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
generator = iter(pages)
if not generator:
# syntax error, show help text from the top of this file
wikipedia.showHelp('reflinks')
wikipedia.stopme()
sys.exit()
generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
generator = pagegenerators.RedirectFilterPageGenerator(generator)
bot = ReferencesRobot(generator, always, limit)
bot.run()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()
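# Typical invocations (illustrative):
#   python reflinks.py -lang:fr -page:Exemple -limit:5
#   python reflinks.py -xml:frwiki-latest-pages-articles.xml -xmlstart:Foo -always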