Uzanto:Maksim/sozd sta
* cht_x_sta.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
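# Bulk page downloader: reads page titles (one per line, optionally wrapped
# in [[...]]) from <arg1>.txt, fetches them in batches of 64 with
# wikipedia.getall(), and writes "title, text, ================" records to
# <arg1>_rez.txt, or one numbered file <arg1>NNNNNN.txt per page when the
# second argument is '2'.  Progress is logged to <arg1>_ot.txt, which is also
# used to resume an interrupted run; creating cht_x_sta_stop.txt stops the
# bot after the current batch.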
import sys, codecs, re, os
import wikipedia, date, xmlreader
from bib_tekst import *
def provstop(fstop):
return os.access(fstop,os.F_OK)
def main():
fstop='cht_x_sta_stop.txt'
try:
os.unlink(fstop)
except:
pass
if provstop(fstop):
wikipedia.output(u'stop0!\n')
return
# fkat = wikipedia.argHandler(sys.argv[1], 'cht_cat')
# fzap = wikipedia.argHandler(sys.argv[2], 'cht_cat')
# fotch = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fkat = fi+'.txt'
fzap = fi
fotch = fi+'_ot.txt'
flotd=0
if len(sys.argv)>=3 and sys.argv[2]==u'2':
flotd=1
mysite = wikipedia.getSite()
f=codecs.open(fkat,'r',encoding='utf-8')
skon=u''
i=0
if os.access(fotch,os.F_OK):
otch = codecs.open(fotch, 'rb', 'utf-8')
for s in otch.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
while len(s)>0 and s[len(s)-1]==u'\n':
s=s[:len(s)-1]
if (s[0]==u'+' or s[0].isdigit()) and len(s)>=9:
s1=s[8:]
skon=s1
if s[0].isdigit():
i=0
j=0
while j<=5:
i=i*10+ord(s[j])-ord(u'0')
j+=1
i+=1
otch.close()
wikipedia.output(u'skon=%s\n i=%d' % (skon,i))
otch = codecs.open(fotch, 'a', 'utf-8')
flrab=0
t1=f.readlines()
if not flotd:
if skon==u'':
vertfile = codecs.open(u'%s_rez.txt' % fzap, 'w', 'utf-8')
else:
vertfile = codecs.open(u'%s_rez.txt' % fzap, 'a', 'utf-8')
j0=0
while j0<len(t1):
plm=[]
j=j0
while j<len(t1) and j<j0+64:
n=t1[j]
if ord(n[0]) == 65279:
n=n[1:]
n=ubr_nk_prob(n)
if n.startswith(u'[['):
n=n[2:]
if len(n)>2 and n[len(n)-2:]==u']]':
n=n[:len(n)-2]
wikipedia.output(u'... "%s" "%s"\n' % (n,skon))
if skon==u'' or flrab:
pl = wikipedia.Page(mysite,n)
plm.append(pl)
elif n==skon:
flrab=1
j+=1
try:
wikipedia.getall(mysite, plm)
except xmlreader.xml.sax._exceptions.SAXParseException:
wikipedia.getall(mysite, plm)
for pl in plm:
try:
t = pl.get()
if flotd:
vertfile = codecs.open(u'%s%06d.txt' % (fzap,i), 'w', 'utf-8')
vertfile.write(pl.title() + '\n')
vertfile.write(t + '\n')
if flotd:
vertfile.close()
else:
vertfile.write('================\n')
vertfile.flush()
b = u'%06d %s\n' % (i,pl.title())
wikipedia.output(b)
otch.write(b)
otch.flush()
i=i+1
except wikipedia.IsRedirectPage:
b = u'+ %s\n-> %s\n' % (pl.title(),pl.getRedirectTarget())
wikipedia.output(b)
otch.write(b)
otch.flush()
except wikipedia.NoPage:
b = u'- %s\n' % pl.title()
wikipedia.output(b)
otch.write(b)
otch.flush()
j0=j
if provstop(fstop):
wikipedia.output(u'stop!\n')
return
try:
main()
finally:
wikipedia.stopme()
try:
excludefile.close()
except:
pass
* cht_mn_sta.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
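# Single-page downloader: reads page titles from <arg1>.txt and saves each
# existing page to a numbered file <arg1>NNNNNN.txt (title on the first line,
# wikitext below).  Redirects ('+ ... ->') and missing pages ('-') are only
# logged; the report is written to <arg1>_ot.txt.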
import sys, codecs, re
import wikipedia, date, catlib
try:
# fkat = wikipedia.argHandler(sys.argv[1], 'cht_cat')
# fzap = wikipedia.argHandler(sys.argv[2], 'cht_cat')
# fotch = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fkat = fi+'.txt'
fzap = fi
fotch = fi+'_ot.txt'
mysite = wikipedia.getSite()
f=codecs.open(fkat,'r',encoding='utf-8')
otch = codecs.open(fotch, 'w', 'utf-8')
t1=f.readlines()
i=0
for line in t1:
if ord(line[0]) == 65279:
line=line[1:]
workingcatname = line
wikipedia.output(workingcatname)
pl = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(workingcatname))
ple=pl.exists()
if ple:
try:
t = pl.get()
vertfile = codecs.open(u'%s%06d.txt' % (fzap,i), 'w', 'utf-8')
vertfile.write(pl.title() + '\n')
vertfile.write(t + '\n')
vertfile.flush()
vertfile.close()
b = u'%06d %s\n' % (i,pl.title())
wikipedia.output(b)
otch.write(b)
otch.flush()
i=i+1
except wikipedia.IsRedirectPage:
b = u'+ %s\n-> %s\n' % (pl.title(),pl.getRedirectTarget())
wikipedia.output(b)
otch.write(b)
otch.flush()
else:
b = u'- %s\n' % pl.title()
wikipedia.output(b)
otch.write(b)
otch.flush()
finally:
wikipedia.stopme()
try:
excludefile.close()
except:
pass
* cht_mn_cat.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
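# Category lister: reads category names from <arg1>.txt and writes, for each
# category, a '# <name>' header followed by the titles of its articles and
# subcategories to <arg1>_rez.txt.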
import sys, codecs, re
import wikipedia, date, catlib
try:
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fkat = fi+u'.txt'
fzap = fi+u'_rez.txt'
# fkat = wikipedia.argHandler(sys.argv[1], 'cht_cat')
# fzap = wikipedia.argHandler(sys.argv[2], 'cht_cat')
f=codecs.open(fkat,'r',encoding='utf-8')
# f=open(fkat,'r') #,encoding='utf-8')
vertfile = codecs.open(fzap, 'w', 'utf-8')
mysite = wikipedia.getSite()
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
while len(s)>0 and s[len(s)-1]==u'\n':
s=s[:len(s)-1]
wikipedia.output(s)
workingcat = catlib.Category(mysite,s)
vertfile.write( u'# %s\n' % s )
vertfile.flush()
list = workingcat.articles()
if list:
for pl in list:
vertfile.write( u'%s\n' % pl.title() )
vertfile.flush()
list = workingcat.subcategories()
if list:
for ka in list:
vertfile.write( u'%s\n' % ka.title() )
vertfile.flush()
finally:
wikipedia.stopme()
try:
excludefile.close()
except:
pass
* sozd_sta.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
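# Page creator: reads <arg1>.txt containing blocks of the form
#   title
#   wikitext...
#   a separator line starting with '======='
# and creates each page unless it already exists; passing '2' as the second
# argument (with one more argument after it) appends the text to existing
# pages instead of skipping them.  Results go to <arg1>_ot.txt, whose entry
# count is used to skip already processed blocks on a restart; creating
# sozd_sta_stop.txt stops the bot.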
import wikipedia
import re, sys, os
import codecs
def ubr_nk_prob(t):
i=0
while i<len(t) and ((t[i]==u' ') or (t[i]==u'\n') or (t[i]==u'\t') or
(t[i]==u'\r')):
i+=1
j=len(t)-1
while j>i and ((t[j]==u' ') or (t[j]==u'\n') or (t[j]==u'\t') or
(t[j]==u'\r')):
j-=1
return t[i:j+1]
def sozd(otch,n,t,fldob):
n=ubr_nk_prob(n)
if n.startswith(u'[['):
n=n[2:]
if len(n)>2 and n[len(n)-2:]==u']]':
n=n[:len(n)-2]
pl = wikipedia.Page(mysite,n)
wikipedia.output(pl.title())
try:
st=pl.get()
except wikipedia.NoPage:
st=u''
except wikipedia.IsRedirectPage:
st=u'--> [['+pl.getRedirectTarget()+u']]\n----\n'
try:
if st!=u'' and not fldob:
wikipedia.output(u"Page %s already exists, not adding!"%pl.title())
otch.write(u'- %s\n' % pl.title())
otch.flush()
elif st!=u'' and fldob:
pl.put(st+u'\n\n'+t, comment = u"", minorEdit = False)
otch.write(u'++ %s\n' % pl.title())
otch.flush()
else:
pl.put(t, comment = u"", minorEdit = False)
otch.write(u'+ %s\n' % pl.title())
otch.flush()
return
except wikipedia.EditConflict:
wikipedia.output(u'!!!EditConflict!!!\n')
otch.write(u'! %s\n' % pl.title())
otch.flush()
def provstop(fstop):
return os.access(fstop,os.F_OK)
def main(filename,fotch,fldob):
fstop='sozd_sta_stop.txt'
try:
os.unlink(fstop)
except:
pass
if provstop(fstop):
wikipedia.output(u'stop0!\n')
return
t=u''
n=u''
f=codecs.open(filename,'rb',encoding='utf-8')
# skon=u''
nkon=0
if os.access(fotch,os.F_OK):
otch = codecs.open(fotch, 'rb', 'utf-8')
for s in otch.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
while len(s)>0 and s[len(s)-1]==u'\n':
s=s[:len(s)-1]
wikipedia.output(u'nnn\n')
s1=s[3:]
if len(s1)>=1:
# skon=s1
nkon+=1
otch.close()
# wikipedia.output(u'skon=%s\n' % skon)
otch = codecs.open(fotch, 'a', 'utf-8')
# flrab=0
i=0
sch=0
for s in f.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %s %s\n' % (i,n,s))
if s[0:7] == u'=======':
if i==1:
# wikipedia.output(u'... "%s" "%s"\n' % (n,skon))
wikipedia.output(u'... %d %d "%s"\n' % (sch,nkon,n))
# if skon==u'' or flrab:
# sozd(otch,n,t,fldob)
# elif n==skon:
# flrab=1
if sch>=nkon:
sozd(otch,n,t,fldob)
sch+=1
if provstop(fstop):
wikipedia.output(u'stop!\n')
return
i=0
t=u''
elif i==0:
n=ubr_nk_prob(s)
# while len(n)>0 and n[len(n)-1]==u'\n':
# n=n[:len(n)-1]
i=1
else:
t=t+s
# t.append(s)
# wikipedia.output(u"No title found - skipping a page.")
# text=''.join(text)
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = filename+u'_ot.txt'
filename+=u'.txt'
fldob=0
if len(sys.argv)>=4 and sys.argv[2]==u'2':
fldob=1
mysite = wikipedia.getSite()
try:
main(filename,fotch,fldob)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* sozd_alidir.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
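# Redirect creator (both directions): reads pairs of titles (two lines per
# pair) from <arg1>; if exactly one page of a pair exists, the missing one is
# created as a #REDIRECT to the existing one.  A report is written to <arg2>.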
import wikipedia
import re, sys
import codecs
def sozd(n1,n2):
pl1 = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n1))
ple1= pl1.exists()
ba=u''
if ple1:
si1=u'+'
try:
pl1.get()
except wikipedia.IsRedirectPage:
ba = u'-> %s\n' % pl1.getRedirectTarget()
else:
si1=u'-'
ti1=pl1.title()
b=u'%s %s\n%s' % (si1,ti1,ba)
wikipedia.output(b)
otch.write(b)
otch.flush()
pl2 = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n2))
ple2= pl2.exists()
ba=u''
if ple2:
si2=u'+'
try:
pl2.get()
except wikipedia.IsRedirectPage:
ba = u'-> %s\n' % pl2.getRedirectTarget()
else:
si2=u'-'
ti2=pl2.title()
b=u'%s %s\n%s' % (si2,ti2,ba)
wikipedia.output(b)
otch.write(b)
otch.flush()
if ple1 & (ple2==0):
t=u'#REDIRECT [[%s]]' % ti1
pl2.put(t, comment = t, minorEdit = True)
wikipedia.output(t)
otch.write(u'%s\n' % t)
otch.flush()
elif (ple1==0) & ple2:
t=u'#REDIRECT [[%s]]' % ti2
pl1.put(t, comment = t, minorEdit = True)
wikipedia.output(t)
otch.write(u'%s\n' % t)
otch.flush()
return
def main():
n=u''
f=codecs.open(filename,'rb',encoding='utf-8')
i=0
for s in f.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %s %s\n' % (i,n,s))
if i==0:
n=s
i=1
else:
sozd(n,s)
i=0
# t.append(s)
# wikipedia.output(u"No title found - skipping a page.")
# text=''.join(text)
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')
for arg in sys.argv[3:]:
arg = wikipedia.argHandler(arg, 'pagefromfile')
if arg:
if arg=="-log":
import logger
sys.stdout = logger.Logger(sys.stdout, filename = 'pagefromfile.log')
mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* sozd_ali1.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
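# Redirect creator (one direction): reads pairs of titles (two lines per
# pair) from <arg1>; if the first page is missing and the second exists, the
# first is created as a #REDIRECT to the second.  The lookup of the second
# title is cached when consecutive pairs share it.  Report file: <arg2>.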
import wikipedia
import re, sys
import codecs
def prov(n1):
pl1 = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n1))
ple1= pl1.exists()
ba=u''
if ple1:
si1=u'+'
try:
pl1.get()
except wikipedia.IsRedirectPage:
ba = u'-> %s\n' % pl1.getRedirectTarget()
else:
si1=u'-'
ti1=pl1.title()
b=u'%s %s\n%s' % (si1,ti1,ba)
wikipedia.output(b)
otch.write(b)
otch.flush()
return (pl1,ple1,ti1)
def sozd(pl1,ple1,ti1,pl2,ple2,ti2):
if (ple1==0) & ple2:
t=u'#REDIRECT [[%s]]' % ti2
pl1.put(t, comment = t, minorEdit = True)
b=u'++ %s\n%s\n' % (ti1,t)
wikipedia.output(b)
otch.write(b)
otch.flush()
otch.write(u'\n')
otch.flush()
return
def main():
n=u''
pl2pr=None
ple2pr=0
ti2pr=u''
spr=u''
f=codecs.open(filename,'rb',encoding='utf-8')
i=0
for s in f.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %s %s\n' % (i,n,s))
if i==0:
n=s
i=1
else:
(pl1,ple1,ti1)=prov(n)
if spr==s:
pl2=pl2pr
ple2=ple2pr
ti2=ti2pr
else:
(pl2,ple2,ti2)=prov(s)
sozd(pl1,ple1,ti1,pl2,ple2,ti2)
spr=s
pl2pr=pl2
ple2pr=ple2
ti2pr=ti2
i=0
# t.append(s)
# wikipedia.output(u"No title found - skipping a page.")
# text=''.join(text)
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')
for arg in sys.argv[3:]:
arg = wikipedia.argHandler(arg, 'pagefromfile')
if arg:
if arg=="-log":
import logger
sys.stdout = logger.Logger(sys.stdout, filename = 'pagefromfile.log')
mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* saveHTML1.py
# -*- coding: utf-8 -*-
"""
(C) 2004 Thomas R. Koll, <tomk32@tomk32.de>
Distribute under the terms of the PSF license.
This bot downloads the HTML-pages of articles and images
and saves the interesting parts, i.e. the article-text
and the footer to a file like Hauptseite.txt.
TODO:
change the paths in the HTML-file
Options:
-o: Specifies the output-directory where to save the files
-images: Download all images
-overwrite:[I|A|B] Ignore existing Images|Article|Both and
download them even if they exist
Features, not bugs:
* Won't d/l images of an article if you set -overwrite:A
"""
__version__ = '$Id: saveHTML.py,v 1.10 2005/10/13 20:10:03 leogregianin Exp $'
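# Modified variant of saveHTML.py: instead of the options described above, it
# reads page titles from <arg1>.txt, fetches the rendered HTML of each page
# over HTTP and saves it unchanged to <arg1>NNNNNN.htm; any extra command-line
# argument also downloads the images used by the pages (saved as
# <arg1>NNNNNN_<imagename>).  A report is written to <arg1>_ot.txt.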
import wikipedia,httplib,StringIO,re,sys,md5,os, string,codecs,urllib
from htmlentitydefs import *
def extractArticle(data):
""" takes a string with the complete HTML-file
and returns the article which is contained in
<div id='article'> and the pagestats which
contain information on last change """
images = []
s = StringIO.StringIO(data)
rPagestats = re.compile('.*(\<span id\=(\"|\')pagestats(\"|\')\>.*\<\/span\>).*')
rBody = re.compile('.*<div id\=\"content\">.*')
rFooter = re.compile('.*<div id\=\"footer\">.*')
rDivOpen = re.compile('.*<div ')
rDivClose = re.compile('.*<\/div>.*')
divLevel = 1
divLast = -1
inArticle = 0
inFooter = 0
result = {'article':"",
'footer':""}
for line in s:
if line == "<p><br /></p>":
continue
        # strip non-breaking spaces (the HTML entity and the raw UTF-8 character)
        line = line.replace("&nbsp;", " ")
        line = line.replace("\xc2\xa0", " ")
if rDivOpen.match(line):
divLevel = divLevel + 1
if rBody.match(line):
inArticle = 1
divLast = divLevel-2
elif rFooter.match(line):
divLast = divLevel-1
inFooter = 1
if inArticle:
result['article'] += line
elif inFooter:
result['footer'] += line
if rDivClose.match(line):
divLevel = divLevel - 1
if divLevel == divLast:
inArticle = 0
inFooter = 0
divLast = -1
return result
def html2txt(str):
dict = {"%C3%A4": "ä",
"%C3%B6": "ö",
"%C3%BC": "ü",
"%C3%84": "Ä",
"%C3%96": "Ö",
"%C3%9C": "Ü",
"%C3%9F": "ß",
"%27": "'",
"%28": "(",
"%29": ")",
"%2C": ","
}
for entry in dict:
str = re.sub(entry, dict[entry], str)
return str
def extractImages(data):
""" takes a string with the complete HTML-file
and returns the article which is contained in
<div id='article'> and the pagestats which
contain information on last change """
images = []
rImage = re.compile('<a href=[\r\n]*?"/wiki/.*?:(.*?)".*?[\r\n]*?.*?class=[\r\n]*?"image"', re.MULTILINE)
    rThumb = re.compile('<a href=[\r\n]*?"/wiki/.*?:(.*?)".*?[\r\n]*?.*?class=[\r\n]*?"internal".*?[\r\n]*?.*?<img', re.MULTILINE | re.DOTALL)
last = ""
img = rImage.findall(data)
timg = rThumb.findall(data)
for i in timg:
try:
img.index(i)
except:
img.append(i)
print "Bilder: ", img
for image in img:
path = md5.new(html2txt(image)).hexdigest()
images.append( {'image': image,
'path' : str(path[0])+"/"+str(path[0:2])+"/"})
images.sort()
return images
def main():
mysite = wikipedia.getSite()
save_images = 0
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fkat = fi+'.txt'
fzap = fi
fotch = fi+'_ot.txt'
if len(sys.argv)>=3:
save_images = 1
headers = {"Content-type": "application/x-www-form-urlencoded",
"User-agent": "RobHooftWikiRobot/1.0"}
print "opening connection to", mysite.hostname(),
conn = httplib.HTTPConnection(mysite.hostname())
print " done"
R = re.compile('.*/wiki/(.*)')
f=codecs.open(fkat,'r',encoding='utf-8')
otch = codecs.open(fotch, 'wb', 'utf-8')
t1=f.readlines()
nomerf=0
for s in t1:
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
if len(s)<1:
continue
# for article in sa:
# filename = article.replace("/", "_")
# filename = filename.replace(":", "_")
# if os.path.isfile(output_directory + filename + ".txt") and overwrite_articles == False:
# print "skipping " + article
# continue
data = ""
ua = s
enua = ua.replace(u' ', u'_').encode('utf-8')
enua = urllib.quote(enua)
url = '/wiki/'+ enua
conn.request("GET", url, "", headers)
response = conn.getresponse()
data = response.read()
if len(data) < 2:
b = u'- %s\n' % ua
wikipedia.output(b)
otch.write(b)
otch.flush()
# data = extractArticle(data)
f = open(u'%s%06d.htm' % (fzap,nomerf), 'wb')
# f.write(data['article'] + '\n' + data['footer'])
f.write(data)
f.flush()
f.close()
b = u'%06d %s\n' % (nomerf,ua)
wikipedia.output(b)
otch.write(b)
otch.flush()
nomerf+=1
if save_images:
# images = extractImages(data['article'])
images = extractImages(data)
for i in images:
# if overwrite_images == False and os.path.isfile(output_directory + i['image']):
# print "skipping existing " + i['image']
# continue
print 'downloading ' + i['image'],
uo = wikipedia.MyURLopener()
file = uo.open( "http://upload.wikimedia.org/wikipedia/"
+mysite.lang + '/' + i['path'] + i['image'])
content = file.read()
if (len(content) < 500):
uo.close()
print "downloading from commons",
uo = wikipedia.MyURLopener()
file = uo.open( "http://commons.wikimedia.org/upload/"
+ i['path'] + i['image'])
#print "http://commons.wikimedia.org/upload/", i['path'] , i['image'], file
content = file.read()
# f = open(output_directory + i['image'], "wb")
f = open(u'%s%06d_%s' % (fzap,nomerf,i['image']), 'wb')
f.write(content)
f.close()
b = u' +i %06d %db %s\n' % (nomerf,len(content),i['image'])
wikipedia.output(b)
otch.write(b)
otch.flush()
# print "\t\t", (len(content)/1024), "KB done"
conn.close()
if __name__ == "__main__":
main()
* podg_v_wiki.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
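# Prepares source files for pasting onto a wiki page: for every line of
# <arg1> that starts with 'p', the named file is read and wrapped as
# "* <name>" followed by a <code><pre><nowiki> ... </nowiki> block; the
# wrapped blocks are concatenated into <arg2>.  A page like this one is
# apparently assembled with this script.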
import wikipedia
import re, sys
import codecs
def iskat(t,s):
lt=len(t)
ls=len(s)
i=0
for a in t[:lt-ls]:
if t[i:i+ls]==s:
return i
i+=1
return -1
def ubr_nk_prob(t):
if len(t)<1:
return t
while (t[0]==u' ') or (t[0]==u'\n') or (t[0]==u'\t') or (t[0]==u'\r'):
t=t[1:]
if len(t)<1:
return t
while (t[len(t)-1]==u' ') or (t[len(t)-1]==u'\n') or (t[len(t)-1]==u'\t') or (t[len(t)-1]==u'\r'):
t=t[:len(t)-1]
if len(t)<1:
return t
return t
def main(ft,fvih):
f=open(ft,'rb')
fperv=1
# rez=''
for s in f.readlines():
# if ord(s[0]) == 65279:
# s=s[1:]
s=s.replace('\r','')
if s[len(s)-1]=='\n':
s=s[:len(s)-1]
if len(s)<1:
continue
if s[0]=='p':
fi=ubr_nk_prob(s[2:])
f1=open(fi,'rb')
rez1=f1.read()
            if fperv:
                # start with an empty string of the same type as the data read
                rez=rez1[:0]
                fperv=0
            rez+='* %s\n<code><pre><nowiki>\n'%fi
            rez+=rez1
            rez+='</nowiki>\n'
            f1.close()
    fr=open(fvih, 'wb')
    fr.write(rez)
    fr.flush()
    fr.close()

if __name__ == "__main__":
    try:
        ft = wikipedia.argHandler(sys.argv[1], 'perevod')
        fvih = wikipedia.argHandler(sys.argv[2], 'perevod')
        main(ft,fvih)
    finally:
        wikipedia.stopme()
* zamen.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
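# Text replacement bot: reads a task file <arg1>.txt made of blocks separated
# by '============'.  A block lists page titles (a line starting with '|'
# gives the edit summary), followed by one or more rules of the form
# '?=======' + flag line, '-=======' + old text, '+=======' + new text.
# Flag characters control matching (e.g. 'u' skips a page if the new text is
# already present; 'l', 'k' and 's' make matching link- and case-aware).
# Edits are logged to <arg1>_ot.txt, full before/after texts to <arg1>_op.txt.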
import wikipedia
import re, sys
import codecs
from bib_tekst import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def sravn_opc(t,i,s,opc):
ls=len(s)
if i+ls>len(t):
return 0
k=0
while k<ls:
a=t[i+k]
b=s[k]
if (u'k' in opc) and ( (k>0 and (s[k-1] in u'[:')) or
(k==0 and (u'l' in opc)) ):
a=a.upper()
b=b.upper()
if a!=b:
return 0
k+=1
return 1
def iskat_opc(t,i,s0,opc):
ss=[s0]
if (u's' in opc) and len(s0)>2 and s0[len(s0)-2:]==u']]':
ss.append(s0[:len(s0)-2]+u'|')
lt=len(t)
while i<lt:
if (u'l' in opc) and (t[i:i+2]==u'[[' or t[i:i+2]==u'{{'):
pn=i+2
(pk,ik)=iskats_mn(t,pn,[u'[[',u']]',u'{{',u'}}',u'|',u'#'])
if pk==-1:
pk=lt
while pn<pk and (t[pn] in [u' ',u'\n',u'\t',u'\r']):
pn+=1
if pn<pk and t[pn]==u':' and s0[0]!=u':':
pn+=1
while pn<pk and (t[pk-1] in [u' ',u'\n',u'\t',u'\r']):
pk-=1
            ssilk=t[pn:pk].replace(u'_',u' ').replace(u'  ',u' ')
# ssilk=perv_upper(ubr_nk_prob(ssilk))
# vivod(u'%d %d %d %d %s %s\n'%(pn,pk,len(ssilk),len(s0),ssilk,s0))
if len(ssilk)==len(s0) and sravn_opc(ssilk,0,s0,opc):
return (pn,pk)
i=pk
for j in range(len(ss)):
s=ss[j]
ls=len(s)
ravn=sravn_opc(t,i,s,opc)
if ravn:
if j==1:
m=i+ls
m1=iskkonpodp(t,m,u'[[',u']]')
if m1!=-1:
return (i,m1+2)
return (i,i+ls)
i+=1
return (-1,-1)
def zam(n,tzfl,tzst,tzno,kol,komm):
pl = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n))
wikipedia.output(pl.title())
# pl._contents=u'qwe [[qwe]] [[qwer|qwe]] [[fqwe]] qwe :qwe {{wqwe}} [[qwe|qwer]] [qwe] sqwr dwer [[:qwe]] qwet dqwe gqwej [[enn:qwe]] [[qwey]]\n'
# pl._contents+=u'Qwe [[Qwe]] [[Qwer|Qwe]] [[fQwe]] Qwe :Qwe {{wQwe}} [[Qwe|Qwer]] [Qwe] sQwr dwer [[:Qwe]] Qwet dQwe gQwej [[enn:Qwe]] [[Qwey]]'
try:
nt = pl.get()
except wikipedia.NoPage:
b = u'- %s\n\n' % pl.title()
vivod(b)
return
except wikipedia.IsRedirectPage:
b = u'+ %s\n-> %s\n\n' % (pl.title(),pl.getRedirectTarget())
vivod(b)
return
otch.write(u'+ %s\n' % pl.title())
otch.flush()
otch2.write(u'=========================\n%s\n-=======\n%s\n' % (pl.title(),nt))
otch2.flush()
j=0
fliwi=1
while j<kol:
if (u'w' in tzfl[j]):
fliwi=0
j+=1
if fliwi:
oiwi = pl.interwiki()
niwi = {}
for pl2 in oiwi:
if pl.site()!=pl2.site():
niwi[pl2.site()] = pl2
nt = wikipedia.replaceLanguageLinks(nt, {})
nt = nt.replace(u'\r',u'')
# while u' \n' in nt:
# nt = nt.replace(u' \n',u'\n')
fzam=0
j=0
while j<kol:
zst=tzst[j]
zst=zst[:len(zst)-1]
zno=tzno[j]
zno=zno[:len(zno)-1]
# if mysite.lang==u'eo':
# zst=zst.replace(u'Ĉ',u'Cx').replace(u'ĉ',u'cx')
# zst=zst.replace(u'Ĝ',u'Gx').replace(u'ĝ',u'gx')
# zst=zst.replace(u'Ĵ',u'Jx').replace(u'ĵ',u'jx')
# zst=zst.replace(u'Ĥ',u'Hx').replace(u'ĥ',u'hx')
# zst=zst.replace(u'Ŝ',u'Sx').replace(u'ŝ',u'sx')
# zst=zst.replace(u'Ŭ',u'Ux').replace(u'ŭ',u'ux')
# zno=zno.replace(u'Ĉ',u'Cx').replace(u'ĉ',u'cx')
# zno=zno.replace(u'Ĝ',u'Gx').replace(u'ĝ',u'gx')
# zno=zno.replace(u'Ĵ',u'Jx').replace(u'ĵ',u'jx')
# zno=zno.replace(u'Ĥ',u'Hx').replace(u'ĥ',u'hx')
# zno=zno.replace(u'Ŝ',u'Sx').replace(u'ŝ',u'sx')
# zno=zno.replace(u'Ŭ',u'Ux').replace(u'ŭ',u'ux')
# if (u'u' in tzfl[j]) and (zno.lower() in nt.lower()):
if (u'u' in tzfl[j]):
(p,pk)=iskat_opc(nt,0,zno,tzfl[j])
if p!=-1:
b=u'uzxe\n'
wikipedia.output(b)
otch.write(b)
otch.flush()
j+=1
continue
# nt = nt.replace(zst,zno)
nz=0
p0=0
while (len(zno)>0 or len(zst)>0) and p0<len(nt):
if zst==u'':
if (u'n' in tzfl[j]):
p=0
else:
p=len(nt)
pk=p
else:
# p=iskats(nt,p0,zst)
(p,pk)=iskat_opc(nt,p0,zst,tzfl[j])
if p==-1:
break
nt=nt[:p]+zno+nt[pk:]
p0=p+len(zno)
nz+=1
fzam=1
if zst==u'':
break
b=u'%d\n' % nz
wikipedia.output(b)
otch.write(b)
otch.flush()
j+=1
if not fliwi:
pl1=wikipedia.Page(pl.site(),pl.title())
pl1._contents=nt
oiwi = pl1.interwiki()
niwi = {}
for pl2 in oiwi:
if pl.site()!=pl2.site():
niwi[pl2.site()] = pl2
nt = wikipedia.replaceLanguageLinks(nt, {})
if fzam:
while nt[len(nt)-1:]==u'\n':
nt=nt[:len(nt)-1]
# if fliwi:
nt+=wikipedia.replaceLanguageLinks(u'', niwi)
otch2.write(u'+=======\n%s\n' % nt)
otch2.flush()
pl.put(nt, comment = komm, minorEdit = False)
otch.write(u'\n')
otch.flush()
return
def main():
zfl=0
tzfl={}
tzst={}
tzno={}
komm=u''
n=[]
f=codecs.open(filename,'rb',encoding='utf-8')
i=0
j=0
for s in f.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %d %s %s\n' % (i,j,n,s))
s=s.replace(u'\r',u'')
if s[len(s)-1]!=u'\n':
s=s+u'\n'
if s[0:12] == u'============':
# wikipedia.output(u'k%d %d %s %s\n' % (i,j,n,s))
if i==100:
b=u'!er %s\n' % n
wikipedia.output(b)
otch.write(b)
otch.flush()
if (i!=100) and (j>0):
# wikipedia.output(u'kl%d %d %s %s\n' % (i,j,n,s))
for n1 in n:
zam(n1,tzfl,tzst,tzno,j,komm)
n=[]
komm=u''
i=0
j=0
elif i==0:
if s[0:8]==u'?=======':
i=2
elif s[0]==u'|':
komm=s[1:]
elif s[0]!=u'#':
n.append(s)
elif i==2:
zfl= (s[0]!=u'0')
if zfl:
tzfl[j]=s
tzst[j]=u''
tzno[j]=u''
j+=1
i=3
elif i==3:
if s[0:8]==u'-=======':
i=4
else:
i=100
wikipedia.output(u"er i==3\n")
elif i==4:
if s[0:8]==u'+=======':
i=5
else:
if zfl:
tzst[j-1]+=s
else: #if i==5:
if s[0:8]==u'?=======':
i=2
else:
if zfl:
tzno[j-1]+=s
# wikipedia.output(u"No title found - skipping a page.")
# text=''.join(text)
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = fi+'.txt'
fotch = fi+'_ot.txt'
fotch2 = fi+'_op.txt'
#filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')
#fotch2 = wikipedia.argHandler(sys.argv[3], 'cht_cat')
#fliwi=1
#if len(sys.argv)>=3 and sys.argv[2]==u'w':
# fliwi=0
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* perev_kat.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
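# Category translator: loads a translation dictionary (<arg3>) with
# perevod.zagruzslov, reads lines of the form 'lang:CategoryPrefix:Name' from
# <arg4>, translates each category name into the target language (<arg2>)
# with perev_kateg, and writes '<target-lang>:<translation>' followed by the
# original line to <arg5>.txt; a log goes to <arg5>-ot.txt.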
import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *
from get_buf import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def main(slov,vhjaz,vihjaz,fivh,firez):
f=codecs.open(fivh,'rb',encoding='utf-8')
frez=codecs.open(firez, 'w', 'utf-8')
ish=[]
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
ish.append(s)
f.close
ish.sort()
vihsi=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')
spred=u''
for s in ish:
if s==spred:
continue
spred=s
st=s.split(u':',2)
if len(st)!=3:
vivod(u'!!! len(st)!=3 s=%s\n'%s)
continue
kato=[Kateg(st[0],st[2])]
nt_kat_sp=perev_kateg(slov,nssvoj,vihsi,kato,1,otch)
otch.write(u'\n\n===============\n\n')
otch.flush()
for t in nt_kat_sp:
frez.write(u'%s:%s\n%s\n\n'%(vihjaz,t,s))
frez.flush()
if __name__ == "__main__":
try:
vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
fi = wikipedia.argHandler(sys.argv[5], 'perevod')
fvih= fi+u'.txt'
fotch = fi+u'-ot.txt'
# fotch2 = fi+u'-op.txt'
# fipoln = fi+u'-pp.txt'
# finesl = fi+u'-ns.txt'
# fink = fi+u'-nk.txt'
if vhjaz==u'-':
vhjaz=u''
otch = codecs.open(fotch, 'w', 'utf-8')
# otch2 = codecs.open(fotch2, 'w', 'utf-8')
slov={}
nssvoj={}
slovdop=[]
perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)
main(slov,vhjaz,vihjaz,fvh,fvih)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* perev_oivi.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
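# Interwiki back-link generator: reads a dump file (<arg3>) in the
# "title / text / =======" format; for every page that already carries an
# interwiki link to the target language (<arg2>), it emits a zamen.py task
# block which adds '[[<source-lang>:<title>]]' to that target-language page.
# The tasks are written to <arg4>.txt.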
import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def sozd(vhjaz,vihjaz,n,t):
vhsi=wikipedia.getSite(code = vhjaz,fam = u'wikipedia')
vihsi=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')
pl=wikipedia.Page(vhsi,n)
pl._contents=t
oiwi = pl.interwiki()
s=u''
for pl2 in oiwi:
if pl2.site()==vihsi:
s=pl2.title()
if s==u'':
return u''
b=((u'|interwiki\n%s\n'+
u'?=======\n1wu\n-=======\n+=======\n\n[[%s:%s]]\n'+
u'============\n')%(s,vhjaz,n))
return b
def main(vhjaz,vihjaz,filename,firez):
t=u''
n=u''
f=codecs.open(filename,'rb',encoding='utf-8')
frez=codecs.open(firez, 'w', 'utf-8')
i=0
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
if s[0:7] == u'=======':
if i==1:
rez=sozd(vhjaz,vihjaz,n,t)
if rez!=None:
frez.write(rez)
frez.flush()
i=0
t=u''
elif i==0:
n=ubr_nk_prob(s)
i=1
else:
t=t+s
if __name__ == "__main__":
try:
vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
fvh = wikipedia.argHandler(sys.argv[3], 'perevod')
fi = wikipedia.argHandler(sys.argv[4], 'perevod')
fvih= fi+u'.txt'
# fotch = fi+u'-ot.txt'
# otch = codecs.open(fotch, 'w', 'utf-8')
main(vhjaz,vihjaz,fvh,fvih)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* perev_sta.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
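# Article translator: reads a dump file (<arg4>) in the "title / text /
# =======" format and machine-translates each article from <arg1> to <arg2>
# with the dictionary <arg3>, translating the title, the text sentence by
# sentence, links, image names and categories, and keeping interwiki links.
# Results are written in sozd_sta.py format to <arg6>.txt, ordered by a
# priority score based on untranslated words, sentence count and missing
# categories; untranslated words, missing categories and missing images go to
# the -ns/-nk/-iz files, processed source pages to the -uz file.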
import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def prov_dobav_katcom1(nscom,zapriz,katcomsp,n):
n=perv_upper(ubr_nk_prob(n))
vivod(u' prov_dobav_katcom1 %s\n'%n)
p=iskats(n,0,u'#')
if p!=-1:
n=n[:p]
vivod(u' prov_dobav_katcom1# %s\n'%n)
t=u'Category:'+n
if nscom.has_key(t) and (not zapriz.has_key(n)):
t1=u'[['+t+u']]\n'
if not t1 in katcomsp:
katcomsp.append(t1)
vivod(u' <-- ++\n')
return 1
return 0
def prov_dobav_katcom(nscom,zapriz,katcomsp,vhs,jaz):
prov_dobav_katcom1(nscom,zapriz,katcomsp,vhs)
if jaz==u'en':
vhok=vhs[len(vhs)-1]
if vhok==u'y':
vhs1=vhs[:len(vhs)-1]+u'ies'
prov_dobav_katcom1(nscom,zapriz,katcomsp,vhs1)
else:
vhs1=vhs+u's'
prov_dobav_katcom1(nscom,zapriz,katcomsp,vhs1)
vhs1=vhs+u'es'
prov_dobav_katcom1(nscom,zapriz,katcomsp,vhs1)
def sozd_katcom(nscom,zapriz,nomio,kat):
katcomsp=[]
katprob=[]
for (n,jaz) in nomio:
katprob.append( (n,jaz) )
prov_dobav_katcom(nscom,zapriz,katcomsp,n,jaz)
# if len(katcomsp)<1:
# shcom_v_katcom(nscom,zapriz,katcomsp,shcomi+shcomo)
if len(katcomsp)<1:
vivod(u'- katcom nomio\n')
for ka in kat:
katprob.append( (ka.n,ka.jaz) )
prov_dobav_katcom(nscom,zapriz,katcomsp,ka.n,ka.jaz)
if len(katcomsp)<1:
vivod(u'- katcom\n')
for (n1,jaz) in katprob:
n2=n1.replace(u'(',u' ').replace(u')',u' ')
for n in n2.split(u' '):
if len(n)>=4:
prov_dobav_katcom(nscom,zapriz,katcomsp,n,jaz)
if len(katcomsp)<1:
vivod(u'- katcom2\n')
katcom=u''
for t in katcomsp:
katcom+=t
vivod(u'========katcom=\n%s==========\n' % katcom)
return katcom
def prov_te_format_kart(t):
minrzm=50
t=ubr_nk_prob(t)
if (t==u'thumb') or (t==u'thumbnail') or (t==u'frame') or (t==u'framed'):
return 1
if (t==u'left') or (t==u'center') or (t==u'right'):
return 1
if len(t)<3:
return 0
x=0
i=0
while i<6 and i<len(t) and t[i].isdigit():
x=x*10+ord(t[i])-ord(u'0')
i+=1
if (i>=1) and (t[i:]==u'px'):
if x<minrzm:
return 2
return 1
return 0
class Pereved_sta:
def __init__(self, vhpn,n,t,statn,stat):
self.vhpn = vhpn
self.n = n
self.n2 = n
self.t = t
self.nkat = 0
self.statn = statn
self.stat = stat
self.prioritet = 0
self.npredl = 0
def vz_prioritet(self):
return self.prioritet
def sum_per_stat(s,s1):
if s1.flp:
s.flp = 1
s.nup += s1.nup
s.nnp += s1.nnp
s.nvap += s1.nvap
s.snp += s1.snp
s.svap += s1.svap
def sozd(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,n,t,snka,sniz,snizkat,
schss,tpref,
fl_d_iwi_n,fl_b_tekst):
vivod(u'\n\n\n%s\n'%n)
vhpn=vhjaz+u':'+n
if fl_d_iwi_n and not sp_obr_st.has_key(vhpn):
vivod(u'<- net v tbl\n')
return
if slov.has_key((vhjaz,n)) and slov[(vhjaz,n)].riwi:
vivod(u'<- uzxe v real ivi\n')
return
vhsi=wikipedia.getSite(code = vhjaz,fam = u'wikipedia')
vihsi=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')
vhprefi=vhsi.family.image_namespace(vhjaz)
vihprefi=vihsi.family.image_namespace(vihjaz)
pl=wikipedia.Page(vhsi,n)
pl._contents=t
nomio=[(n,vhjaz)]
oiwi = pl.interwiki()
niwi = {}
for pl2 in oiwi:
if pl2.site()==vihsi:
vivod(u'uzxe vihjaz %s %s\n'%(n,pl2.title()))
return
if pl2.site()!=vhsi:
niwi[pl2.site()] = pl2
nomio.append( (pl2.title(),pl2.site().lang) )
niwi[vhsi]=pl
plkat = pl.categories()
kato=[]
for pka in plkat:
kato.append(Kateg(vhjaz,pka.title(),1))
t = wikipedia.removeLanguageLinks(t)
t = wikipedia.removeCategoryLinks(t,vhsi)
rezn_t=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,n)
if len(rezn_t)>0:
fl_iwi_n=1
rezn=rezn_t[0]
statn=perevod.Perev_stat()
else:
fl_iwi_n=0
(rezn,statn)=perevod.perevod_stat(slov,vhjaz,vihjaz,n)
vihpl=wikipedia.Page(vihsi,rezn)
t = t.replace(u'\r',u' ').replace(u'\t',u' ')
    while u'  ' in t:
        t = t.replace(u'  ',u' ')
t = t.replace(u' \n',u'\n')
tks=[u'. ',u'.\n',u'\n\n',u'\n ',u'\n:',u'\n*',u'[[',u'{{',u'\n=',u'=\n',
u'<gallery>',u'<math>',u']]',]
tkzs={u'[[':u']]',u'{{':u'}}',u'<gallery>':u'</gallery>',
u'<math>':u'</math>'}
st=[]
rt=u''
sst=[]
uktbl=[]
npredl=0
snizdl0=len(sniz)
p=0
while (not fl_b_tekst) and p<len(t):
(p1,i)=iskats_mn(t,p,tks)
# vivod(u'nach predl %d %d\n'%(p1,i))
if p1==-1:
p1=len(t)
# elif tks[i][0]==u'.' or tks[i][0]==u'=':
# else:
elif not tkzs.has_key(tks[i]):
p1+=1
rt+=t[p:p1].replace(u'[[',u' ').replace(u']]',u' ')
kon_predl=1
ss_vih=u''
if tks[i]==u'[[':
os=u'[['
zs=u']]'
p2=iskkonpodp(t,p1+len(os),os,zs)
if p2==-1:
p2=len(t)
(pr,ir)=iskats_mn(t,p1+len(os),[u'|',os,zs])
pnt=p1+len(os)
pks=p2
if pr!=-1 and ir==0:
pnt=pr+1
pks=pr
ss=ubr_nk_prob(t[p1+len(os):pks])
ss_pdt=iskats(ss,1,u':')
if ss_pdt==-1:
kon_predl=0
perss=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,ss)
p3=p2+len(zs)
p4=p3
if len(perss)>=1:
while p4<len(t) and t[p4].isalpha():
p4+=1
uktbl.append(len(rt))
rt+=( (t[pnt:p2]+t[p3:p4]).
replace(u'[[',u' ').
replace(u']]',u' ')
)
uktbl.append(len(rt))
sst.append(perss[0])
if schss.has_key(perss[0]):
schss[perss[0]]+=1
else:
schss[perss[0]]=1
else:
rt+=t[pnt:p2].replace(u'[[',u' ').replace(u']]',u' ')
p=p4
elif RBR(vhprefi,ss[:ss_pdt]) or RBR(u'Image',ss[:ss_pdt]):
ss=ss[ss_pdt+1:]
ss=perv_upper(ubr_nk_prob(ss.replace(u'_',u' ')))
if nscom.has_key(u'Image:'+ss):
ss_vih=vihprefi+u':'+ss
else:
ss_vh=vhprefi+u':'+ss
perss=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,ss_vh)
if len(perss)>=1:
ss_vih=perss[0]
else:
b=u'[['+vhjaz+u':'+ss_vh+u']]'
if not b in sniz:
sniz.append(b)
snizkat.append(u'')
if kon_predl:
# st.append(rt)
            while u'  ' in rt:
                rt = rt.replace(u'  ',u' ')
if len(rt)>=20:
npredl+=1
(r1,s1)=perevod.perevod_stat(slov,vhjaz,vihjaz,rt,uktbl=uktbl)
qi=len(sst)-1
while qi>=0:
ss=sst[qi]
q1=uktbl[qi*2]
q2=uktbl[qi*2+1]
if perv_upper(ss)==perv_upper(r1[q1:q2]):
r1=r1[:q1]+u'[['+r1[q1:q2]+u']]'+r1[q2:]
else:
r1=r1[:q1]+u'[['+ss+u'|'+r1[q1:q2]+u']]'+r1[q2:]
qi-=1
st.append((r1,s1))
rt=u''
sst=[]
uktbl=[]
if tkzs.has_key(tks[i]):
os=tks[i]
zs=tkzs[tks[i]]
p2=iskkonpodp(t,p1+len(os),os,zs)
if p2==-1:
p=len(t)
else:
p=p2+len(zs)
if tks[i]==u'<math>':
st.append((t[p1:p],perevod.Perev_stat()))
elif ss_vih!=u'':
pa=p1+len(os)
while 1:
(pr,ir)=iskats_mn(t,pa,[u'|',zs])
if pr==-1:
pr=len(t)
if pa==p1+len(os) or prov_te_format_kart(t[pa:pr]):
pa=pr
if pa<len(t) and t[pa]==u'|':
pa+=1
continue
break
st.append((u'[['+ss_vih+u'|'+t[pnt:pa],
perevod.Perev_stat()))
p=pa
else:
p=p1
rez=u''
stat=perevod.Perev_stat()
for (r1,s1) in st:
rez+=r1
sum_per_stat(stat,s1)
    while u'  ' in rez:
        rez = rez.replace(u'  ',u' ')
while u'\n\n\n' in rez:
rez = rez.replace(u'\n\n\n',u'\n\n')
rez=ubr_nk_prob(rez)
nt_kat_sp=perev_kateg(slov,nssvoj,vihsi,kato,0,otch)
if len(nt_kat_sp)<1:
for ka in kato:
jaz=ka.jaz
issite=wikipedia.getSite(jaz,fam = pl.site().family)
iskatprefi=issite.family.category_namespace(jaz)
b=jaz+u':'+iskatprefi+u':'+ka.n
if not b in snka:
snka.append(b)
nt_kat=kateg_v_tekst(nt_kat_sp)
rez+=u'\n\n'+wikipedia.replaceLanguageLinks(nt_kat, niwi, site=vihsi)
ps=Pereved_sta(vhpn,rezn,rez,statn,stat)
ps.nkat=len(nt_kat_sp)
osnp={}
for a in statn.snp:
osnp[a]=1
for a in stat.snp:
# vivod(u'%s\n'%a)
# if not osnp.has_key(a):
osnp[a]=1
if snizdl0<len(sniz):
katcom=sozd_katcom(nscom,zapriz,nomio,kato)
katcom=katcom.replace(u'\n',u' ')
i=snizdl0
while i<len(sniz):
snizkat[i]=katcom
i+=1
ps.npredl=npredl
prioritet = (len(osnp)+statn.nvap)*10000+stat.nvap*10
if not fl_b_tekst:
if npredl<3:
prioritet+=100000000
else:
prioritet/=npredl
if tpref!=u'':
ps.t=u'{{polurinda movu|%s}}\n'%rezn+ps.t
ps.n2=tpref+ps.n
else:
ps.t=u'{{polurinda}}\n'+ps.t
if len(nt_kat_sp)<1:
prioritet+=10000000
if fl_d_iwi_n and not fl_iwi_n:
prioritet+=200000000
ps.prioritet=prioritet
vivod(u'%d %d %s\n\n'%(ps.prioritet,ps.npredl,n))
return ps
def main(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
filename,firez,finesl,finka,finiz,fiuzxe,fipref,
fl_d_iwi_n,fl_b_tekst,fl_schss):
t=u''
n=u''
f=codecs.open(filename,'rb',encoding='utf-8')
vivod(u'fipref=%s\n'%fipref)
tpref=u''
if fipref!=u'.':
fpref=codecs.open(fipref, 'rb', 'utf-8')
s=fpref.readlines()[0]
fpref.close()
if len(s)>0 and ord(s[0]) == 65279:
s=s[1:]
tpref=ubr_nk_prob(s)
vivod(u'tpref=%s\n'%tpref)
frez=codecs.open(firez, 'w', 'utf-8')
fnesl=codecs.open(finesl, 'w', 'utf-8')
fnka=codecs.open(finka, 'w', 'utf-8')
fniz=codecs.open(finiz, 'w', 'utf-8')
fuzxe=codecs.open(fiuzxe, 'w', 'utf-8')
tps=[]
snka=[]
sniz=[]
snizkat=[]
schss={}
i=0
for s in f.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %s %s\n' % (i,n,s))
if s[0:7] == u'=======':
if i==1:
rez=sozd(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
n,t,snka,sniz,snizkat,
schss,tpref,fl_d_iwi_n,fl_b_tekst)
if rez!=None:
tps.append(rez)
i=0
t=u''
elif i==0:
n=s
while len(n)>0 and n[len(n)-1]==u'\n':
n=n[:len(n)-1]
i=1
else:
t=t+s
for t in snka:
fnka.write(u'%s\n'%t)
fnka.flush()
for i in range(len(sniz)):
fniz.write(u'%s %s\n'%(sniz[i],snizkat[i]))
fniz.flush()
if fl_schss==1:
for tp in tps:
if schss.has_key(tp.n):
tp.prioritet=-schss[tp.n]
else:
tp.prioritet=1000
elif fl_schss==2:
for tp in tps:
tp.prioritet=-tp.nkat
# tps.sort(key=Pereved_sta.vz_prioritet,reverse=True)
tps.sort(key=Pereved_sta.vz_prioritet)
osnpo={}
osvapo={}
kolotch=50
i=0
while i<len(tps):
pr0=tps[i].prioritet
osnp={}
osvap={}
osnp1={}
osvap1={}
j=i
while j<len(tps) and pr0==tps[j].prioritet:
frez.write(u'%s\n%s\n========\n' % (tps[j].n2,tps[j].t))
frez.flush()
fuzxe.write(u'%s\n' % tps[j].vhpn)
fuzxe.flush()
perevod.perev_uch_nezsl(osnp,osvap,tps[j].statn)
perevod.perev_uch_nezsl(osnp,osvap,tps[j].stat)
for sl in osnp.iterkeys():
if not osnpo.has_key(sl):
osnp1[sl]=osnp[sl]
for sl in osvap.iterkeys():
if not osvapo.has_key(sl):
osvap1[sl]=osvap[sl]
perevod.perev_uch_nezsl(osnpo,osvapo,tps[j].statn)
perevod.perev_uch_nezsl(osnpo,osvapo,tps[j].stat)
j+=1
fnesl.write(u'prior=%d n=%d\n========\n' % (pr0,j-i))
fnesl.flush()
perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnp1,osvap1,fnesl,kolotch)
fnesl.write(u'=========================\n')
fnesl.flush()
i=j
fnesl.write(u'=========================*=*=*=*=\n')
fnesl.flush()
kolotch2=1000
perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnpo,osvapo,fnesl,kolotch2)
fnesl.write(u'\n\n')
fnesl.flush()
ssnp=[]
for t, n in osnpo.iteritems():
ssnp.append((n,t))
ssnp.sort(reverse=True)
i=0
for n, t in ssnp:
if len(t)>=4 and t.startswith(u'Un'):
t1=t[2:]
# prist=u'Mal'
prist=u'Ne'
elif len(t)>=4 and (t.startswith(u'Ill') or
t.startswith(u'Imb') or
t.startswith(u'Imm') or
t.startswith(u'Imp') or
t.startswith(u'Irr') or
t.startswith(u'In') ):
t1=t[2:]
prist=u'Ne'
else:
continue
(t2,stat2)=perevod.perevod_stat(slov,vhjaz,vihjaz,t1)
if len(t2)>=6 and t2[0]==u'(' and t2[len(t2)-1]==u')':
t2=t2[1:len(t2)-1]
t2t=t2.split(u', ')
t2r=u''
i=0
while i<len(t2t):
t2r+=prist+t2t[i]
i+=1
if i>=len(t2t):
break
t2r+=u', '
fnesl.write(u'%s : %s\n'%(t,t2r))
fnesl.flush()
if __name__ == "__main__":
try:
vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
fipref = wikipedia.argHandler(sys.argv[5], 'perevod')
fi = wikipedia.argHandler(sys.argv[6], 'perevod')
fvih= fi+u'.txt'
fotch = fi+u'-ot.txt'
fipoln = fi+u'-pp.txt'
finesl = fi+u'-ns.txt'
finka = fi+u'-nk.txt'
finiz = fi+u'-iz.txt'
fiuzxe = fi+u'-uz.txt'
fl_d_iwi_n=0
if len(sys.argv)>=8 and (u'w' in sys.argv[7]):
fl_d_iwi_n=1
fl_b_tekst=0
if len(sys.argv)>=8 and (u'k' in sys.argv[7]):
fl_b_tekst=1
fl_schss=0
if len(sys.argv)>=8 and (u's' in sys.argv[7]):
fl_schss=1
elif len(sys.argv)>=8 and (u'c' in sys.argv[7]):
fl_schss=2
otch = codecs.open(fotch, 'w', 'utf-8')
#otch2 = codecs.open(fotch2, 'w', 'utf-8')
slov={}
nssvoj={}
nscom={}
zapriz={}
slovdop={}
perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)
if slovdop.has_key(u'c'):
nscom=slovdop[u'c']
if slovdop.has_key(u'z'):
zapriz=slovdop[u'z']
sp_obr_st={}
if slovdop.has_key(u'spst'):
sp_obr_st=slovdop[u'spst']
sp_obr_uzxe={}
if slovdop.has_key(u'pe_uzxe'):
sp_obr_uzxe=slovdop[u'pe_uzxe']
for t in sp_obr_uzxe:
if sp_obr_st.has_key(t):
del sp_obr_st[t]
main(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
fvh,fvih,finesl,finka,finiz,fiuzxe,fipref,
fl_d_iwi_n,fl_b_tekst,fl_schss)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* perev_tit.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
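# Title translator: translates each title listed in <arg4> from <arg1> to
# <arg2> using the dictionary <arg3>.  Fully translated titles (or all of
# them with the '2' option) are written as 'original|translation' to
# <arg5>.txt; translations not yet present locally are also collected in the
# -st and -pp files, and untranslated words are reported in the -ns file.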
import wikipedia
import re, sys
import codecs
import perevod
def iskat(t,s):
lt=len(t)
ls=len(s)
i=0
while i<=lt-ls:
if t[i:i+ls]==s:
return i
i+=1
return -1
def iskats_mn(t,p0,ss):
i=p0
while i<len(t):
for j in range(len(ss)):
if t[i].startswith(ss[j]):
return (i,j)
i+=1
return (-1,0)
def perv_upper(t):
if len(t)<1:
return u''
return t[0].upper()+t[1:]
def ubr_nk_prob(t):
i=0
while i<len(t) and ((t[i]==u' ') or (t[i]==u'\n') or (t[i]==u'\t') or
(t[i]==u'\r')):
i+=1
j=len(t)-1
while j>i and ((t[j]==u' ') or (t[j]==u'\n') or (t[j]==u'\t') or
(t[j]==u'\r')):
j-=1
return t[i:j+1]
def main(slov,nssvoj,vhjaz,vihjaz,fvh,fvih,fipoln,finesl,fispit,flvsjo):
otch.write(u'len(nssvoj)=%d\n'%len(nssvoj))
otch.flush()
kolotch=1500
f0=codecs.open(fvh,'rb',encoding='utf-8')
f1=codecs.open(fvih, 'w', 'utf-8')
fpoln=codecs.open(fipoln, 'w', 'utf-8')
fnesl=codecs.open(finesl, 'w', 'utf-8')
fspit=codecs.open(fispit, 'w', 'utf-8')
osnp={}
osvap={}
for s in f0.readlines():
# wikipedia.output(u'%d\n' % ord(s[0]))
if ord(s[0]) == 65279:
s=s[1:]
# wikipedia.output(u'%d %d %s %s\n' % (i,j,n,s))
s=s.replace(u'\r',u'').replace(u'_',u' ')
# if s[len(s)-1]!=u'\n':
# s=s+u'\n'
s=ubr_nk_prob(s)
# (rez,fl_perev)=perevod(slov,vhjaz,vihjaz,s)
(rez,stat)=perevod.perevod_stat(slov,vhjaz,vihjaz,s)
if flvsjo or (stat.nnp==0 and stat.nvap==0):
f1.write(s+u'|'+rez+u'\n')
f1.flush()
if nssvoj.has_key(rez):
otch.write(rez+u'\n')
otch.write(u' <- uzxe\n')
otch.flush()
else:
fspit.write(s+u'\n')
fspit.flush()
fpoln.write(rez+u'\n')
fpoln.flush()
perevod.perev_uch_nezsl(osnp,osvap,stat)
perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnp,osvap,fnesl,kolotch)
if __name__ == "__main__":
try:
vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
fi = wikipedia.argHandler(sys.argv[5], 'perevod')
fvih= fi+u'.txt'
fotch = fi+u'-ot.txt'
fipoln = fi+u'-pp.txt'
finesl = fi+u'-ns.txt'
fispit = fi+u'-st.txt'
flvsjo=0
if len(sys.argv)>=7 and sys.argv[6]==u'2':
flvsjo=1
otch = codecs.open(fotch, 'w', 'utf-8')
#otch2 = codecs.open(fotch2, 'w', 'utf-8')
slov={}
nssvoj={}
slovdop={}
perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)
main(slov,nssvoj,vhjaz,vihjaz,fvh,fvih,fipoln,finesl,fispit,flvsjo)
finally:
wikipedia.stopme()
* vich_slivi.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
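# List subtraction: loads the lines of <arg2> (skipping those that start with
# '<arg1>:') into a set, then copies the lines of <arg3> to <arg4>.txt,
# dropping and logging every line found in that set.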
import wikipedia
import re, sys
import codecs
from bib_tekst import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def main():
n=[]
f=codecs.open(filename,'rb',encoding='utf-8')
f1=codecs.open(fi1,'rb',encoding='utf-8')
global otch
otch = codecs.open(fotch, 'w', 'utf-8')
frez = codecs.open(firez, 'w', 'utf-8')
ns={}
for s in f1.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
if len(s)>0 and not s.startswith(vhjaz+u':'):
ns[s]=1
sch={}
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
if ns.has_key(s):
vivod(u'- %s\n'%s)
else:
frez.write(s+u'\n')
frez.flush()
#fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#filename = fi+'.txt'
#fotch2 = fi+'_op.txt'
vhjaz = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fi1 = wikipedia.argHandler(sys.argv[2], 'cht_cat')
filename = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[4], 'cht_cat')
firez = fi+'.txt'
fotch = fi+'_ot.txt'
#otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* zamen_slivi.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
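# Link renaming: reads 'old|new' pairs from <arg3> (both sides are prefixed
# with '<arg1>:'), then rewrites every line of <arg2> by following the map,
# chaining at most 10 steps to avoid cycles, and writes the result to
# <arg4>.txt; usage counts per mapping are reported at the end.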
import wikipedia
import re, sys
import codecs
from bib_tekst import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def main():
n=[]
f=codecs.open(filename,'rb',encoding='utf-8')
f1=codecs.open(fi1,'rb',encoding='utf-8')
global otch
otch = codecs.open(fotch, 'w', 'utf-8')
frez = codecs.open(firez, 'w', 'utf-8')
ns={}
for s in f1.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
st=s.split(u'|')
if len(st)==2:
ns[vhjaz+u':'+ubr_nk_prob(st[0])]=vhjaz+u':'+ubr_nk_prob(st[1])
sch={}
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
i=0
while ns.has_key(s):
if sch.has_key(s):
sch[s]+=1
else:
sch[s]=1
s=ns[s]
i=i+1
if i>=10:
vivod(u'!cikl %s\n'%s)
break
frez.write(s+u'\n')
frez.flush()
for ss,sn in ns.iteritems():
n=0
if sch.has_key(ss):
n=sch[ss]
vivod(u'%d %s | %s\n'%(n,ss,sn))
#fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#filename = fi+'.txt'
#fotch2 = fi+'_op.txt'
vhjaz = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = wikipedia.argHandler(sys.argv[2], 'cht_cat')
fi1 = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[4], 'cht_cat')
firez = fi+'.txt'
fotch = fi+'_ot.txt'
#otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* podg_mma_f0_v2.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
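# Scans an XML dump (<arg1>): every non-redirect article whose title has no
# ':' and whose text contains one of the Esperanto birth/death categories
# listed in iskkat below has its title written to <arg2>.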
import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *
def ubrkoment (text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
match = nowikiOrHtmlCommentR.search(text)
while match:
text = text[:match.start()] + text[match.end():]
match = nowikiOrHtmlCommentR.search(text)
return text
def korr_im (jaz,t,pref):
for p in pref:
if t.startswith(p):
t=t[len(p):]
break
t=perv_upper(t)
if u':' in t:
if ( (not t.startswith(u'Category:')) and
(not t.startswith(u'Image:')) ):
katprefi=mysite.family.category_namespace(jaz)
if not t.startswith(katprefi+u':'):
izprefi=mysite.family.image_namespace(jaz)
if not t.startswith(izprefi+u':'):
return u''
return t
iskkat=[
u'[[Kategorio:Naskiĝ',
# u'[[kategorio:Naskiĝ',
u'[[Kategorio:naskiĝ',
# u'[[kategorio:naskiĝ',
u'[[Kategorio:Mort',
# u'[[kategorio:Mort',
u'[[Kategorio:mort',
# u'[[kategorio:mort',
]
def main(vhjaz,fvh,fvih,fipref):
tzfl=0
tzst={}
tzno={}
pref=[]
if fipref!=u'':
fpref=codecs.open(fipref,'rb',encoding='utf-8')
for s in fpref.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
pref.append(s)
fpref.close()
n=u''
# f0=codecs.open(fvh,'rb',encoding='utf-8')
f1=codecs.open(fvih, 'w', 'utf-8')
insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
tblredir = {}
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
redirR = wikipedia.getSite().redirectRegex()
readPagesCount = 0
for entry in dump.parse():
readPagesCount += 1
# always print status message after 1000 pages
if readPagesCount % 1000 == 0:
print '%i pages read...' % readPagesCount
m = redirR.search(entry.text)
if (not m) and (not u':' in entry.title):
t=entry.text.replace(u'[[ ',u'[[').replace(u': ',u':')
t=t.replace(u'[[k',u'[[K')
dop=0
for isk in iskkat:
if isk in t:
dop=1
break
if dop:
b=u'%s\n' % entry.title
f1.write(b)
f1.flush()
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref=u''
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main(vhjaz,fvh,fvih,fpref)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* podg_mma_f1.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
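# Scans an XML dump (<arg1>) for redirects whose title and target contain the
# same words and are equal ignoring case; writes 'flag|title|target' to
# <arg2>, where flag 1 marks pairs in which the target only lower-cases
# letters of the title.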
import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *
def ubrkoment (text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
match = nowikiOrHtmlCommentR.search(text)
while match:
text = text[:match.start()] + text[match.end():]
match = nowikiOrHtmlCommentR.search(text)
return text
def korr_im (jaz,t,pref):
for p in pref:
if t.startswith(p):
t=t[len(p):]
break
t=perv_upper(t)
if u':' in t:
if ( (not t.startswith(u'Category:')) and
(not t.startswith(u'Image:')) ):
katprefi=mysite.family.category_namespace(jaz)
if not t.startswith(katprefi+u':'):
izprefi=mysite.family.image_namespace(jaz)
if not t.startswith(izprefi+u':'):
return u''
return t
def main(vhjaz,fvh,fvih,fipref):
tzfl=0
tzst={}
tzno={}
pref=[]
if fipref!=u'':
fpref=codecs.open(fipref,'rb',encoding='utf-8')
for s in fpref.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
pref.append(s)
fpref.close()
n=u''
# f0=codecs.open(fvh,'rb',encoding='utf-8')
f1=codecs.open(fvih, 'w', 'utf-8')
insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
tblredir = {}
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
redirR = wikipedia.getSite().redirectRegex()
readPagesCount = 0
for entry in dump.parse():
readPagesCount += 1
# always print status message after 10000 pages
if readPagesCount % 10000 == 0:
print '%i pages read...' % readPagesCount
m = redirR.search(entry.text)
if m:
target = m.group(1)
# There might be redirects to another wiki. Ignore these.
for code in wikipedia.getSite().family.langs.keys():
if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
target = None
break
# if the redirect does not link to another wiki
if target:
target = target.replace('_', ' ')
# remove leading and trailing whitespace
target = target.strip()
# capitalize the first letter
if not wikipedia.getSite().nocapitalize:
target = target[0].upper() + target[1:]
if '#' in target:
target = target[:target.index('#')]
if '|' in target:
wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title)
target = target[:target.index('|')]
# tblredir[entry.title] = target
# b=u'%s|%s\n' % (entry.title, target)
tt=entry.title
tc=target
tts=tt.split(u' ')
tcs=tc.split(u' ')
ltt=len(tts)
ltc=len(tcs)
if ltt==ltc and ltt>=2 and tt.lower()==tc.lower():
dop=1
for i in range(ltt):
if tts[i]!=tcs[i]:
if len(tts[i])!=len(tcs[i]):
dop=0
for j in range(len(tts[i])):
if (tts[i][j]==tts[i][j].lower() and
tcs[i][j]!=tcs[i][j].lower()):
dop=0
# for j in range(len(tcs[i])):
# if j>0 and tcs[i][j]!=tcs[i][j].lower():
# dop=0
b=u'%d|%s|%s\n' % (dop,tt, tc)
f1.write(b)
f1.flush()
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref=u''
if len(sys.argv)>=5:
fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main(vhjaz,fvh,fvih,fpref)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* podg_mma_f1_v2.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
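# Variant of podg_mma_f1.py: only redirects whose target appears in the list
# <arg3> and whose title has at least two words are considered;
# 'flag|title|target' is written to <arg2>, with flag 1 when the title has
# capital letters inside words while the target does not.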
import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *
def ubrkoment (text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
match = nowikiOrHtmlCommentR.search(text)
while match:
text = text[:match.start()] + text[match.end():]
match = nowikiOrHtmlCommentR.search(text)
return text
def korr_im (jaz,t,pref):
for p in pref:
if t.startswith(p):
t=t[len(p):]
break
t=perv_upper(t)
if u':' in t:
if ( (not t.startswith(u'Category:')) and
(not t.startswith(u'Image:')) ):
katprefi=mysite.family.category_namespace(jaz)
if not t.startswith(katprefi+u':'):
izprefi=mysite.family.image_namespace(jaz)
if not t.startswith(izprefi+u':'):
return u''
return t
def main(vhjaz,fvh,fvih,fipref):
tzfl=0
tzst={}
tzno={}
tblz={}
pref=[]
if 1:
fpref=codecs.open(fipref,'rb',encoding='utf-8')
for s in fpref.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=ubr_nk_prob(s)
tblz[s]=1
fpref.close()
n=u''
# f0=codecs.open(fvh,'rb',encoding='utf-8')
f1=codecs.open(fvih, 'w', 'utf-8')
insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
tblredir = {}
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
redirR = wikipedia.getSite().redirectRegex()
readPagesCount = 0
for entry in dump.parse():
readPagesCount += 1
# always print status message after 1000 pages
if readPagesCount % 1000 == 0:
print '%i pages read...' % readPagesCount
m = redirR.search(entry.text)
if m:
target = m.group(1)
# There might be redirects to another wiki. Ignore these.
for code in wikipedia.getSite().family.langs.keys():
if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
target = None
break
# if the redirect does not link to another wiki
if target:
target = target.replace('_', ' ')
# remove leading and trailing whitespace
target = target.strip()
# capitalize the first letter
if not wikipedia.getSite().nocapitalize:
target = target[0].upper() + target[1:]
if '#' in target:
target = target[:target.index('#')]
if '|' in target:
wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title)
target = target[:target.index('|')]
# tblredir[entry.title] = target
# b=u'%s|%s\n' % (entry.title, target)
tt=entry.title
tc=target
ttk=tt.replace(u'-',u' ').replace(u'.',u' ').replace(u'(',u' ')
tck=tc.replace(u'-',u' ').replace(u'.',u' ').replace(u'(',u' ')
tts=ttk.split(u' ')
tcs=tck.split(u' ')
ltt=len(tts)
ltc=len(tcs)
if tblz.has_key(tc) and ltt>=2:
dop=1
dop2=0
for i in range(ltt):
if len(tts[i])>=2 and tts[i][1:]!=tts[i][1:].lower():
dop2=1
for i in range(ltc):
# if ( ( len(tcs[i])>2 or
# (len(tcs[i])==2 and tcs[i][1]!=u'.') ) and
# tcs[i]==tcs[i].upper() ):
if len(tcs[i])>=2 and tcs[i][1:]!=tcs[i][1:].lower():
dop=0
if dop2==0:
dop=0
b=u'%d|%s|%s\n' % (dop,tt,tc)
f1.write(b)
f1.flush()
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main(vhjaz,fvh,fvih,fpref)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* podg_mma_f2.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
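# Reads '1|title|target' pairs (output of podg_mma_f1*.py) from <arg3>, scans
# the XML dump <arg1>, and for every non-redirect article (skipping
# 'Vikipedio:' pages) whose text contains one of the titles, writes a
# zamen.py task block (edit summary '|malmajuskligo') replacing that title
# with its target; the tasks go to <arg2>.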
import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *
def ubrkoment (text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
match = nowikiOrHtmlCommentR.search(text)
while match:
text = text[:match.start()] + text[match.end():]
match = nowikiOrHtmlCommentR.search(text)
return text
def korr_im (jaz,t,pref):
for p in pref:
if t.startswith(p):
t=t[len(p):]
break
t=perv_upper(t)
if u':' in t:
if ( (not t.startswith(u'Category:')) and
(not t.startswith(u'Image:')) ):
katprefi=mysite.family.category_namespace(jaz)
if not t.startswith(katprefi+u':'):
izprefi=mysite.family.image_namespace(jaz)
if not t.startswith(izprefi+u':'):
return u''
return t
def main(vhjaz,fvh,fvih,fipref):
tzfl=0
tzst={}
tzno={}
tblz={}
pref=[]
if 1:
fpref=codecs.open(fipref,'rb',encoding='utf-8')
for s in fpref.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
st=s.split(u'|')
if len(st)==3 and st[0]==u'1':
tblz[st[1]]=st[2]
fpref.close()
n=u''
# f0=codecs.open(fvh,'rb',encoding='utf-8')
f1=codecs.open(fvih, 'w', 'utf-8')
insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
tblredir = {}
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
redirR = wikipedia.getSite().redirectRegex()
readPagesCount = 0
sch_str=0
sch_zam=0
for entry in dump.parse():
readPagesCount += 1
# always print status message after 1000 pages
if readPagesCount % 1000 == 0:
print '%i pages read...' % readPagesCount
m = redirR.search(entry.text)
if m:
pass
else:
if entry.title.startswith(u'Vikipedio:'):
continue
fperv=1
for tt, tc in tblz.iteritems():
if ( (not entry.title.startswith(tt)) and
(not entry.title.startswith(tc)) and (tt in entry.text) ):
if fperv:
b=u'|malmajuskligo\n%s\n'%entry.title
f1.write(b)
f1.flush()
sch_str+=1
b=((u'?=======\n1l\n-=======\n%s\n+=======\n%s\n')%(tt,tc))
f1.write(b)
f1.flush()
fperv=0
sch_zam+=1
if fperv==0:
f1.write(u'========================\n')
f1.flush()
wikipedia.output(u'sch_str=%d sch_zam=%d'%(sch_str,sch_zam))
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main(vhjaz,fvh,fvih,fpref)
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* svoj-ka.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
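# Writes the titles of all pages of the home wiki in namespace 14
# (categories) to <arg1>; a commented-out loop does the same for namespace 6
# (images).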
import wikipedia, pagegenerators
import re, sys
import codecs
import perevod,imagetransfer1
def iskat(t,s):
lt=len(t)
ls=len(s)
i=0
while i<=lt-ls:
if t[i:i+ls]==s:
return i
i+=1
return -1
def iskats(t,i,s):
lt=len(t)
ls=len(s)
while i<=lt-ls:
if t[i:i+ls]==s:
return i
i+=1
return -1
def iskato(t,i,s):
ls=len(s)
while i>=0:
if t[i:i+ls]==s:
return i
i-=1
return -1
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def ubr_nk_prob(t):
if len(t)<1:
return t
while (t[0]==u' ') or (t[0]==u'\n') or (t[0]==u'\t') or (t[0]==u'\r'):
t=t[1:]
if len(t)<1:
return t
while (t[len(t)-1]==u' ') or (t[len(t)-1]==u'\n') or (t[len(t)-1]==u'\t') or (t[len(t)-1]==u'\r'):
t=t[:len(t)-1]
if len(t)<1:
return t
return t
# orphaned fragment of a removed helper (zapis_fimen, fimen and nomerf are
# not defined in this file):
# otch.write(u'\n\n\n')
# otch.flush()
# zapis_fimen(fimen,nomerf,pl)
# return
def main():
mysite = wikipedia.getSite()
# sico=wikipedia.getSite(code = u'commons',fam = u'commons')
global otch
otch = codecs.open(u'%s' % filename, 'w', 'utf-8')
# for page in mysite.allpages(start = u'', namespace = 6):
# vivod(page.title()+u'\n')
for page in mysite.allpages(start = u'', namespace = 14):
vivod(page.title()+u'\n')
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
for arg in sys.argv[2:]:
arg = wikipedia.argHandler(arg, 'ivsen')
if arg:
if arg=="-log":
import logger
sys.stdout = logger.Logger(sys.stdout, filename = 'com-izobr.log')
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()
* isk_ssilk.py
# -*- coding: UTF-8 -*-
__version__='$Id:'
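# Link extractor: for every page title listed in <arg1>.txt, fetches the page
# and writes the titles of all pages it links to into <arg1>_sk.txt;
# redirects and missing pages are logged to <arg1>_ot.txt, and the full page
# texts are dumped to <arg1>_op.txt.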
import wikipedia
import re, sys
import codecs
from bib_tekst import *
def vivod(b):
wikipedia.output(b)
otch.write(b)
otch.flush()
def isk_kateg(n):
pl = wikipedia.Page(mysite,n)
wikipedia.output(pl.title())
try:
ot = pl.get()
except wikipedia.NoPage:
b = u'- %s\n\n' % pl.title()
vivod(b)
return
except wikipedia.IsRedirectPage:
b = u'+ %s\n-> %s\n\n' % (pl.title(),pl.getRedirectTarget())
vivod(b)
return
otch.write(u'+ %s\n' % pl.title())
otch.flush()
otch2.write(u'=========================\n%s\n-=======\n%s\n' % (pl.title(),ot))
otch2.flush()
katprefi=mysite.family.category_namespace(mysite.lang)
lip=pl.linkedPages()
for pl2 in lip:
t=pl2.title()
# if t.startwith(katprefi+u':') or t.startwith(u':'+katprefi+u':'):
frez.write(t+u'\n')
def main():
n=[]
f=codecs.open(filename,'rb',encoding='utf-8')
i=0
j=0
for s in f.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
isk_kateg(s)
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = fi+'.txt'
fotch = fi+'_ot.txt'
fotch2 = fi+'_op.txt'
firez = fi+'_sk.txt'
#filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')
#fotch2 = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fliwi=1
if len(sys.argv)>=3 and sys.argv[2]==u'w':
fliwi=0
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
frez = codecs.open(firez, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()