Also, pinyin can be placed over hanzi (like Ruby annotation in HTML).
It is advisable to replace | with / in the segmented version before taking the pinyin transcription (and say "no" if it asks "segment first?" again). Then the / characters will still be present in the pinyin. This is desirable because, if you happen to hit a dictionary entry with a space in it (such as zai4wo3), it will show up as one word in the segmentation but two words in the pinyin; having the /s in gives the script something other than spaces to synchronize on (but it will try to synchronize on spaces as well).
You can save it as a .py file or paste it into a Python interpreter.
# Build pinyin.html: each word of segmented.u8 (hanzi) is written as a <ruby>
# element with the corresponding word of pinyin.u8 as its annotation.
o=open("pinyin.html","w")
o.write("<html><head><meta http-equiv=Content-type content=\"text/html; charset=utf-8\"></head><body><style>ruby { display: inline-table; } ruby * { display: inline; line-height:1.0; text-indent:0; text-align:center; white-space: nowrap; } rb { display: table-row-group; font-size: 100%; } rt { display: table-header-group; font-family: Gandhari, DejaVu Sans, Lucida Sans Unicode, serif; }</style>")
# Both inputs are split into paragraphs (one per line).  The hanzi text gets a
# space after every U+3002 full stop and has its | characters removed, which
# leaves / and spaces as the only things to synchronise on.
pParas = open('pinyin.u8').read().replace("\r\n","\n").split("\n")
hParas = open('segmented.u8').read().decode('utf-8').replace(u'\u3002',u'\u3002 ').replace('|','').encode('utf-8').replace("\r\n","\n").split("\n")
for pPara,hPara in zip(pParas,hParas):
    if pPara.replace(" ","")==hPara.replace(" ",""): # probably a paragraph with no pinyin (wenlin transcription may have changed some spacing)
        o.write(hPara.replace("/","").replace("|","")+"<p>") ; continue # (still pick up stray / or | at start)
    for pinyin,hanzi in zip(pPara.split("/"),hPara.split("/")):
        p2,h2 = pinyin.strip().split(),hanzi.strip().split()
        # Word counts differ in a short chunk: annotate the chunk as a whole
        # instead of guessing which word belongs to which.
        if not len(p2)==len(h2) and len(p2)<10: p2,h2=[pinyin],[hanzi]
        while len(p2)>len(h2): h2.append("") # in case stray word(s) at end
        while len(h2)>len(p2): p2.append("") # ditto
        for pWord,hWord in zip(p2,h2):
            # identical text on both sides gets "-" as its annotation
            if hWord==pWord: pWord="-"
            o.write("<ruby><rb>"+hWord+"</rb><rt>"+pWord+"</rt></ruby>\n")
    if pPara or hPara: o.write("<p>")
o.write("</body></html>")
# Close explicitly so the page is flushed even when this is pasted into an
# interactive interpreter that stays open afterwards.
o.close()
If you are programming a GUI then instead of writing to HTML you
might prefer to use a Tkinter text widget. Below is a version of the above
script that inserts the result into Tkinter instead of producing an HTML file.
(You need to set up the Tkinter text widget and call the function.)
def insert_into_text_widget (text_widget, pinyin_u8str, segmented_u8str):
    """Insert pinyin-over-hanzi labels into a Tkinter text widget.

    text_widget     -- the widget to insert into; its 'font', 'foreground'
                       and 'background' options and its master are reused
                       for the labels that are created.
    pinyin_u8str    -- UTF-8 byte string holding the pinyin transcription.
    segmented_u8str -- UTF-8 byte string holding the segmented hanzi text.

    Paragraphs (lines) of the two texts are paired up; within a paragraph
    the text is synchronised on "/" and then on spaces.  Every word becomes
    a two-line Tkinter.Label (pinyin above hanzi) embedded at the insertion
    point.  Returns the list of created Label widgets (in case it's useful
    for changing the font later, etc).
    """
    import Tkinter
    pinyinText = pinyin_u8str.decode('utf-8').replace("\r\n","\n")
    # A space is added after each U+3002 full stop and | characters are
    # dropped, leaving / and spaces to synchronise on.
    segmented = segmented_u8str.decode('utf-8').replace(u'\u3002',u'\u3002 ').replace('|','').replace("\r\n","\n")
    widgets = []
    for pPara,hPara in zip(pinyinText.split("\n"),segmented.split("\n")):
        if pPara.replace(" ","")==hPara.replace(" ",""):
            # Same text on both sides: no pinyin here, insert as plain text.
            if hPara.strip(): text_widget.insert(Tkinter.INSERT,hPara.replace("/","").replace("|","")+"\n\n")
            continue
        firstWord = 1
        for pChunk,hChunk in zip(pPara.split("/"),hPara.split("/")):
            p2,h2 = pChunk.strip().split(),hChunk.strip().split()
            # Word counts differ in a short chunk: label the chunk as a whole.
            if not len(p2)==len(h2) and len(p2)<10: p2,h2=[pChunk],[hChunk]
            # Otherwise pad the shorter list so stray words are still shown.
            while len(p2)>len(h2): h2.append("")
            while len(h2)>len(p2): p2.append("")
            for pinyin,hanzi in zip(p2,h2):
                if hanzi==pinyin: pinyin="-"
                if not firstWord: text_widget.insert(Tkinter.INSERT," ") # (you can increase that space's width if you want)
                firstWord = 0
                widgets.append(Tkinter.Label(text_widget.master, text=pinyin+"\n"+hanzi, font=text_widget['font'], foreground=text_widget['foreground'], background=text_widget['background']))
                text_widget.window_create(Tkinter.INSERT,window=widgets[-1])
        if pPara or hPara: text_widget.insert(Tkinter.INSERT,"\n\n")
    return widgets # a list of the created widgets (in case it's useful for changing the font later, etc)
# Colour-code notes.u8 as notes.html: runs of Latin letters that look like
# pinyin are wrapped in <SPAN CLASS=py>, characters in the hanzi ranges in
# <SPAN CLASS=hanzi>; anything inside <...> is copied through untouched.
curWord=[] ; isChinese = 0 ; inTag = 0 ; out=[]
for x in open("notes.u8").read().decode("utf-8")+"\n": # add \n to ensure last word is output
    if inTag:
        # inside an existing tag: copy verbatim up to and including ">"
        out.append(x)
        if x==">": inTag=0
        continue
    if ord('A')<=ord(x)<=ord('Z') or ord('a')<=ord(x)<=ord('z') or 0xC0<=ord(x)<=0x1DC:
        # ASCII letter, or a letter in U+00C0..U+01DC: still inside a word
        curWord.append(x)
        # any letter from U+00C0 upwards marks the whole word as pinyin
        if ord(x)>=0xC0: isChinese = 1
    else:
        if curWord:
            # the word has just ended: emit it, tagged if it counts as pinyin
            curWord=u"".join(curWord)
            # these unaccented words are also treated as pinyin
            if curWord.lower() in "de le ne ma zhe shang guo ge".split(): isChinese=1
            if isChinese: out.append("<py>")
            out.append(curWord)
            if isChinese: out.append("</py>")
        # now deal with the non-letter character itself
        isChinese=(0x3000<=ord(x)<0xa700 or ord(x)>=0x10000)
        if isChinese: out.append("<hanzi>")
        if x.strip(): out.append(x) # not whitespace
        elif out and not out[-1]=="\n": out.append("\n") # whitespace becomes a newline, never two in a row
        if isChinese: out.append("</hanzi>")
        curWord=[] ; isChinese = 0
        inTag=(x=="<")
# Merge adjacent spans of the same kind (directly adjacent, or separated by one
# newline), then turn the placeholder tags into real SPANs.  Order matters.
html="".join(out)
for old,new in [
        ("</hanzi><hanzi>",""),
        ("</hanzi>\n<hanzi>","\n"),
        ("</py><py>",""),
        ("</py>\n<py>","\n"),
        ("<hanzi>","<SPAN CLASS=hanzi>"),
        ("</hanzi>","</SPAN>"),
        ("<py>","<SPAN CLASS=py>"),
        ("</py>","</SPAN>"),
    ]:
    html=html.replace(old,new)
open("notes.html","w").write("<style>.py { color: blue; } .hanzi { color: purple; }</style>"+html.encode("utf-8"))
import re
# Reduce every ambiguity marker in pinfix.u8 to its first alternative and save
# the result as pinyin.u8.
# The input is read and processed BEFORE the output is opened for writing, so
# that a missing or unreadable pinfix.u8 does not truncate an existing pinyin.u8.
fixText=open("pinfix.u8").read().decode("utf-8")
# drop everything from the first ";" + U+25CE up to and including the closing U+3011
fixText=re.sub(u";\u25ce[^\u3011]*\u3011","",fixText)
# drop the opening U+3010 U+25CE "Fix:" ... U+25CE prefix
fixText=re.sub(u"\u3010\u25ce *Fix:[^\u25ce]*\u25ce","",fixText)
open("pinyin.u8","w").write(fixText.encode("utf-8"))
If the ambiguities you are fixing are in segmentation, then you could also
try the alternative script below, which, instead of choosing the first
option, merely adds together all the possible split points. This should
avoid any incorrect grouping of syllables, although some syllables will
not be grouped when they should be. May be useful in conjunction with the
above pinyin-over-characters scripts. Input is segfix.u8, output is segmented.u8.
If replacing | with /, do not do it until after this script.
# Read segfix.u8 and write segmented.u8, replacing every ambiguity marker by a
# single segmentation that is split at every point where ANY of the listed
# alternatives is split.
data=open("segfix.u8").read().decode('utf-8')
out=open("segmented.u8","w") ; i=0
fixStart=u"\u3010\u25ceFix:\u25ce" # opening of an ambiguity marker
while i<len(data):
    i2=data.find(fixStart,i)
    if i2==-1: i2=len(data)
    # copy the unambiguous text before the marker (or the rest of the file)
    out.write(data[i:i2].encode('utf-8'))
    if i2==len(data): break
    i = i2+len(fixStart) ; i2 = data.find(u"\u3011",i)
    # No closing U+3011: take the rest of the file as the alternatives.
    # (Leaving i2 at -1 would reset i to 0 below and loop for ever.)
    if i2==-1: i2=len(data)
    alternatives = data[i:i2].split(u";\u25ce")
    # the unsegmented text, taken from the first alternative
    result = alternatives[0].replace(" | ","")
    # splitAfter[n] is set if any alternative has a word ending after character n
    splitAfter = [0]*len(result)
    for alt in alternatives:
        tot = 0
        for word in alt.split(" | "):
            tot += len(word)
            # tot==0 (empty word) must not wrap round to splitAfter[-1]
            if 0<tot<len(result): splitAfter[tot-1]=1
    # insert separators from the right so earlier indices remain valid
    for j in range(len(result)-1,-1,-1):
        if splitAfter[j]: result=result[:j+1]+" | "+result[j+1:]
    out.write(result.encode('utf-8'))
    i=i2+1
out.close()
characters [pin1 yin1] /meaning/
or
traditional simplified [pin1 yin1] /meaning/
then you can convert to Wenlin cidian entry-list format, optionally using Wenlin's existing dictionary to make corrections to the pinyin and/or the traditional/simplified conversion. (You can then re-export if you need a corrected CEDICT for personal use of some other application, or if the dictionary's scope is such that the Wenlin corrections are fair use.)
If you don't need to make corrections, you can skip to the main script below.
Otherwise, with the CEDICT file saved in cedict.u8, first run this small script to save the first two words of every line to word1.u8 and word2.u8:
# Save the first word of every line of cedict.u8 to word1.u8 and the second
# word to word2.u8.
o1,o2=open("word1.u8","w"),open("word2.u8","w")
for l in open("cedict.u8"):
    l=l.split()
    # Exactly one line is written to EACH file for every input line (an empty
    # line when the word is missing), so both files stay line-for-line
    # aligned with cedict.u8.
    if len(l): o1.write(l[0]+"\n")
    else: o1.write("\n")
    if len(l)>1: o2.write(l[1]+"\n")
    else: o2.write("\n")
o1.close() ; o2.close()
Then, if you want Wenlin to correct the traditional-to-simplified conversion, make
a "Simple form characters" transcription of word1.u8 and save
it as simple.u8, otherwise, make sure simple.u8 does not exist.
You do not have to create simple.u8 if the resulting cidian list is to be
imported into Wenlin's dictionary, since the import process will do it
anyway. But you may want to do this step manually if the cidian
is to be re-exported with corrections without actually adding to Wenlin,
since otherwise only one version of the characters will be retained.
(You don't have to fix ambiguities; the script will not attempt to correct
entries that are still ambiguous.)
Similarly, if you want to correct the simplified-to-traditional conversion (in cases where this is not ambiguous), make a "Full form characters" transcription of word2.u8 and save it as full.u8, otherwise, make sure full.u8 does not exist.
If you want Wenlin to correct the pinyin, you can then open word1.u8, segment it, do a pinyin transcription, replace tone marks with 1-4, and save that as word1.u8 (replacing it). You don't have to fix the ambiguities; the script below will attempt to correct an entry only if there are no ambiguities to fix in the correction. If you leave word1.u8 un-transcribed (or not created), then pinyin correction will not be attempted at all.
If you have both traditional and simplified versions in the list, it may be better to source the pinyin corrections from the simplified i.e. word2.u8 (but save the result as word1.u8) as this is less susceptible to causing Wenlin to fail to recognise a word due to a wrong choice of traditional character. Another option is to run the whole process twice, the first time taking pinyin from full form and the second time taking pinyin from the full.u8 corrections (you need to re-export to cedict in between the two runs if you are doing this). In all cases, save Wenlin's pinyin as word1.u8.
The script below will take cedict.u8, and possibly word1.u8, full.u8 and/or simple.u8, and produce entries.u8.
# Start entries.u8 with the "cidian" header line.
o=open("entries.u8","w")
o.write("cidian\n")
# running entry number; it becomes part of each serial-number line
count=0
def genNull():
    """Yield empty strings for ever (stands in for a file that is absent)."""
    while True: yield ""
def tryOpen(fname):
    """Return open(fname), or genNull() if that raises IOError.

    Either result can be iterated line by line, so an optional input file
    that is missing simply supplies "" for every line.
    """
    try: f=open(fname)
    except IOError: f=genNull()
    return f
# Convert cedict.u8 to cidian entries.  word1.u8 (pinyin), simple.u8 and
# full.u8 are optional corrections, read line-for-line alongside cedict.u8;
# a file that is absent supplies "" for every line (see tryOpen).
fw,simp,full = tryOpen("word1.u8"),tryOpen("simple.u8"),tryOpen("full.u8")
import re
for l,corPinyin,corSimp,corFull in zip(open("cedict.u8"),fw,simp,full):
    if not "[" in l or not "/" in l: continue # a comment
    # decode, and replace the full-width comma U+FF0C by an ASCII comma
    l        =l        .decode("utf-8").replace(u"\uff0c",",").strip()
    corPinyin=corPinyin.decode("utf-8").replace(u"\uff0c",",").strip()
    corSimp  =corSimp  .decode("utf-8").replace(u"\uff0c",",").strip()
    corFull  =corFull  .decode("utf-8").replace(u"\uff0c",",").strip()
    # chars = first word (traditional); chars2 = second word (simplified),
    # which is empty when the line has only one headword before the "["
    chars = l[:l.index(" ")]
    chars2 = l[l.index(" ")+1:l.index("[")].strip()
    if not chars2: chars2=chars
    make_2_entries = False
    if corFull and not "Fix:" in corFull: chars=corFull # unambiguous conversion to trad - definite override
    if "Fix:" in corSimp: corSimp=chars2 # still ambiguous: keep cedict's version
    elif chars2 and not corSimp==chars2 and corSimp==chars:
        # ouch, traditional maps to itself and cedict's simplified is different: cedict may be specifying 2 alternative readings instead of trad+simp
        make_2_entries = True
    if not len(corSimp)==len(chars2): corSimp=chars2 # either there wasn't one or there's some corruption
    # Build "simplified[traditional]", with "-" for each traditional character
    # that equals its simplified counterpart.
    chars=list(chars)
    # min(): a traditional form longer than the simplified one must not index
    # past the end of corSimp; its extra characters are kept as they are
    for i in range(min(len(chars),len(corSimp))):
        if corSimp[i]==chars[i]: chars[i]="-"
    chars=u"".join(chars)
    if chars==("-"*len(corSimp)): chars=corSimp # identical: no [] needed
    else: chars = corSimp+u"["+chars+u"]"
    # cedict's own pinyin: drop the neutral-tone 5, write u: as v
    pinyin = l[l.index("[")+1:l.index("]")].replace("5","").replace("u:","v").replace("U:","V")
    if "Fix:" in corPinyin or not corPinyin: corPinyin=pinyin
    else:
        # strip curly quotes U+201C / U+201D from the corrected pinyin
        corPinyin=corPinyin.replace(u"\u201c","").replace(u"\u201d","")
        for c in corPinyin:
            if ord(c)>=0x3000:
                # a character was left untranscribed: use cedict's pinyin instead
                corPinyin=pinyin ; break
    # join the syllables: apostrophe before a syllable starting with a/e/o,
    # no spaces, but keep one space after each comma
    corPinyin=re.sub(" ([aAeEoO])",r"'\1",corPinyin).replace(" ","").replace(",",", ")
    definition=l[l.index("/")+1:l.rindex("/")]
    o.write(("*** \npinyin "+corPinyin+"\ncharacters "+chars+"\nserial-number CEDict"+str(count)+"\ndefinition "+definition+"\nh\nimported from CEDICT; not manually checked\n").encode("utf-8"))
    if make_2_entries: o.write(("*** \npinyin "+corPinyin+"\ncharacters "+chars2+"\nserial-number CEDict-B"+str(count)+"b\ndefinition "+definition+"\nh\nimported from CEDICT; not manually checked\n").encode("utf-8"))
    count += 1
o.close()
If running this more than once, be sure to
change the CEDict after the serial-number
unless you want to replace previous entries.
You may also want to change the
"imported from CEDICT; not manually checked" message.
You will then need to use Wenlin to convert tone numbers to tone marks.
# Collect every distinct run of hanzi in segmented.u8 and write them, sorted,
# one per line, to words.u8.
words={}
curW=[]
for c in open('segmented.u8').read().decode('utf-8'):
    if 0x4e00<=ord(c)<0xa700 or ord(c)>=0x10000: curW.append(c)
    elif curW and c.strip():
        # a non-space, non-hanzi character ends the word (whitespace does not)
        words[u''.join(curW)]=1 ; curW=[]
# A word still pending at end of file (nothing but whitespace after it) would
# otherwise be lost.
if curW: words[u''.join(curW)]=1
words=words.keys() ; words.sort()
open('words.u8','w').write('\n'.join(words).encode('utf-8'))
To add pinyin to these words, make a pinyin transcription of
words.u8 (don't segment first) and save it as pinyin.u8,
then run
# Pair line N of words.u8 with line N of pinyin.u8 and write them to
# output.u8 separated by a tab.
o=open("output.u8","w")
wordLines,pinyinLines=open("words.u8"),open("pinyin.u8")
for w,p in zip(wordLines,pinyinLines):
    o.write("%s\t%s\n" % (w.strip(),p.strip()))
o.close()
result is in output.u8 (tab-delimited).
Or if you prefer working with an incomplete cidian format, run this instead:
# Write an incomplete cidian entry (pinyin, characters, placeholder
# definition) for each word / pinyin pair.
o=open("output.u8","w") ; o.write("cidian\n") ; count=0
for w,p in zip(open("words.u8"),open("pinyin.u8")):
    # count makes each "temporary" serial number unique
    o.write("*** \npinyin "+p.strip()+"\ncharacters "+w.strip()+"\nserial-number temporary"+str(count)+"\ndefinition ?\n")
    count += 1
o.close()
This can then be exported to CEDICT format if you want, but note the export
script will discard any non-Fixed ambiguities in the pinyin, and will not
make up for the lack of full-form equivalents (or simple-form equivalents if
you're working in full form).
import re
# Read the input before opening the output for writing, so that a missing
# entries.txt does not leave behind a truncated entries2.txt.
text=open("entries.txt").read()
# Put an apostrophe between a tone-numbered syllable (letter + digit 1-5) and
# a following a/e/o.
text=re.sub(r"([A-Za-z][1-5])([aAeEoO])",r"\1'\2",text)
open("entries2.txt","w").write(text)
This can be used as a preprocessor to Wenlin's conversion
to tone marks. (However, it is not needed for the above cedict import.)
# Copy to entries2.u8 only those entries of entries.u8 that contain the text
# "my phrase".  The input is read before the output is opened for writing, so
# a missing entries.u8 does not truncate an existing entries2.u8.
source=open("entries.u8").read().replace("\r\n","\n")
# [1:] skips whatever precedes the first "***" (the header line)
wanted=[entry for entry in ["***"+e+"\n" for e in source.split("\n***")[1:]] if "my phrase" in entry]
open("entries2.u8","w").write("".join(wanted))
The following Python script takes two files: entries.u8 is the entry list, and segmented.u8 is the Wenlin-segmented version of it (you don't have to fix anything that needs fixing). It outputs to entries2.u8 any entries for words that Wenlin didn't recognise. You can save it as a .py file or paste it into a Python interpreter.
# Every whitespace-separated item of segmented.u8 (cut at any "[") counts as
# a known word.
known = {}
for w in open("segmented.u8").read().split():
    if "[" in w: w=w[:w.index("[")]
    known[w]=1
o=open("entries2.u8","w")
o.write("cidian.db\n")
count=total=0
for entry in ["***"+e+"\n" for e in open("entries.u8").read().replace("\r\n","\n").split("\n***")]:
    if not "\ncharacters" in entry: continue # e.g. the header
    total+=1
    # isolate the "characters ..." line and cut off any [...] part
    l=entry[entry.index("\ncharacters")+1:] ; l=l[:l.index("\n")]
    if "[" in l: l=l[:l.index("[")]
    # the headword is the second item on that line
    if not l.split()[1] in known:
        o.write(entry) ; count+=1
# close explicitly so the output is flushed even in an interactive session
o.close()
print("Written %d entries (out of %d)" % (count,total))
import re
# Build yentries.u8 from centries.u8: for every entry whose definition is
# a single English word, list the characters under that word.
# Words listed in omit.txt (compared in lower case) are left out.
omit={}
for w in open("omit.txt").read().lower().split(): omit[w]=1
o=open("yentries.u8","w") ; o.write("yinghan\n")
defs={} # English word -> list of character strings, in order first seen
for e in open("centries.u8").read().replace("\r\n","\n").split("\n***")[1:]:
    chars=en=None
    for l in e.split("\n"):
        l=re.sub(r"\([^)]*\)","",l) # ignore anything in parentheses
        if not l.strip().split(): continue
        if l.startswith("characters"): chars=" ".join(l.split()[1:])
        # only a definition consisting of exactly one word is used
        elif "definition" in l.split()[0] and len(l.strip().split())==2: en=l.strip().split()[1]
        elif l=="h": break # (and ignore pinyin - it may be inaccurate anyway if the data originally came from an en-to-zh wordlist)
    if chars and en and re.match(r"^[A-Za-z]*$",en) and not en.lower() in omit:
        charList=defs.setdefault(en,[])
        if chars not in charList: charList.append(chars)
for en,dList in defs.items():
    o.write("*** \n"+en+"\nautomatic\n")
    for d in dList: o.write("definition "+d+"\n")
o.close()
import re
# Rewrite entries.u8 in place: where an entry has a definition whose first
# letter is a capital, capitalise the first letter of its pinyin too.
entries=open("entries.u8").read().replace("\r\n","\n").split("\n***")
for i in range(1,len(entries)): # entries[0] is whatever precedes the first ***
    # a definition line (optionally with a numeric prefix) whose first letter is A-Z
    if re.search(r"\n[0-9]*definition[ \t][^A-Za-z]*[A-Z]",entries[i]):
        lines=entries[i].split("\n")
        for li in range(len(lines)):
            # decode so that an accented first letter is upper-cased as one character
            words=lines[li].decode('utf-8').split()
            if len(words)>=2 and words[0]=="pinyin":
                words[1]=words[1][0].upper()+words[1][1:]
                lines[li]=" ".join(words).encode('utf-8')
                break # only the first pinyin line of the entry
        entries[i]="\n".join(lines)
open("entries.u8","w").write("\n***".join(entries))
Before running this, set Wenlin to use simplified characters (so the full form are in []s), extract the changed cidian entries, and use Wenlin's "Replace tone marks with 1-4" function. Input is entries.u8, output is cedict.u8.
def add_5(pinyin):
    """Return pinyin with "5" added after every syllable lacking a tone digit.

    The input is unspaced pinyin using tone digits 1-5 and v for u-umlaut.
    A syllable is taken to end after a vowel (or r), optionally followed by
    "ng", or by "n" / the "r" of "er", provided that the following character is
    not a vowel (in which case the consonant starts the next syllable).
    """
    pinyin += "@@@" # termination, so that looking up to 3 ahead is always safe
    pl=pinyin.lower() # only recomputed when pinyin changes
    i=0
    while i<len(pinyin):
        if pl[i] in "aeiouvr" and pl[i+1] not in "aeiouv12345":
            # Find where this syllable's tone digit belongs.
            if pl[i+1:i+3]=="ng" and not pl[i+3] in "aeiouv":
                pos=i+3 # after the final "ng"
            elif (pl[i+1]=="n" or pl[i:i+2]=="er") and not pl[i+2] in "aeiouv" and not pl[i]=="r":
                pos=i+2 # after a final "n", or after "er"
            else:
                pos=i+1 # straight after the vowel (or r)
            # add the neutral-tone digit unless a tone digit is already there
            if pl[pos] not in "12345":
                pinyin=pinyin[:pos]+"5"+pinyin[pos:]
                pl=pinyin.lower()
        i+=1
    return pinyin[:-3] # remove the @@'s
import string
# Convert the cidian entries in entries.u8 (tone marks already replaced by
# 1-4) to CEDICT lines in cedict.u8.
o=open("cedict.u8","w")
for e in open("entries.u8").read().replace("\r\n","\n").split("\n***")[1:]:
    en = []; py=ch=None
    for l in e.split("\n"):
        if l.startswith("pinyin"):
            # add missing neutral-tone 5s, then put a space after every tone
            # digit so that there is one space-separated item per syllable
            py=add_5(''.join(l.split()[1:]))
            for tone in "12345": py=py.replace(tone,tone+" ")
            py=py.replace("v","u:").replace("V","U:").replace(",",", ").decode('utf-8').replace(unichr(0xb7),unichr(0xb7)+" ")
            # remove *, superscript 1-3, apostrophes and hyphens
            for c in u"*\u00b9\u00b2\u00b3'-": py=py.replace(c,"")
            py=py.encode('utf-8').strip()
        elif l.startswith("characters"):
            ch=' '.join(l.split()[1:]).decode('utf-8').replace(",",u"\uff0c")
            if '[' in ch:
                # "simplified[traditional]", where "-" in the brackets means
                # "same as the simplified character at this position"
                trad=list(ch[ch.index("[")+1:ch.index("]")])
                ch=ch[:ch.index("[")] ; chLen=len(ch)
                for i in range(len(trad)):
                    if trad[i]=="-": trad[i]=ch[i]
                ch=u"".join(trad)+" "+ch
            else:
                chLen=len(ch)
                ch=ch+" "+ch # the same form is written for both columns
        elif l.strip() and "definition" in l.split()[0]: en.append(' '.join(l.split()[1:]))
        elif l=="h": break
    if py and ch and en:
        # try re-attaching a detached "r5" to a preceding "e<tone>" syllable
        py_alt = py
        for tone in ["1","2","3","4","5"]: py_alt=py_alt.replace("e"+tone+" r5","er"+tone)
        if chLen==len(py_alt.split()): py=py_alt # spurious missing out 'r' when adding tone marks
        if chLen==len(py.split()):
            o.write(ch.encode("utf-8") + " ["+py+"] /"+"/".join(en)+"/\n")
            # or if you want quoted comma-separated format:
            # o.write('"'+ch.encode("utf-8").replace(' ','","')+'","'+py+'","'+"/".join(en)+'"\n')
        else: print("Warning: Omitting ["+py+"] because "+str(len(py.split()))+" syllables against "+str(chLen)+" characters (conversion problem?)")
o.close()
import string,commands,os,sys
# Output files: pleco-CE.txt receives the C->E entries, pleco-EC.txt the
# E->C entries.
oCE,oEC = open("pleco-CE.txt","w"),open("pleco-EC.txt","w")
import string
def decodeSlash(headword):
    """Parse headword into a list of items.

    Each item is either a single character, or character+slash+character
    (e.g. "ab/c" gives ["a", "b/c"]).  "//" and a trailing "/" are not
    supported and fail the assertions below.
    """
    assert not "//" in headword, "// not supported here"
    # a final "/" has no character after it to combine with
    assert not headword[-1:]=="/", "trailing / not supported here"
    headword = list(headword) ; i=0
    while i<len(headword)-1:
        if headword[i+1]=='/':
            # fold "x", "/", "y" into the single item "x/y"
            headword[i] = headword[i]+headword[i+1]+headword[i+2]
            del headword[i+1] ; del headword[i+1]
        i += 1
    return headword
# Read the cidian entries from entries.u8, then write one C->E line per
# entry and one E->C line per definition.
pyList,chList,enList,notesList = [],[],[],[]
for e in open("entries.u8").read().replace("\r\n","\n").split("\n***")[1:]:
    en = [] ; notes=[] ; py=ch=appendMode=None ; nextEnv=""
    for l in e.split("\n"):
        if appendMode: notes.append(l) # everything after the "h" line is notes
        elif l.startswith("pinyin"): py=' '.join(l.split()[1:])
        elif l.startswith("characters"):
            ch=' '.join(l.split()[1:]).decode('utf-8').replace(",",u"\uff0c")
            if '[' in ch:
                trad=decodeSlash(ch[ch.index("[")+1:ch.index("]")])
                simp=decodeSlash(ch[:ch.index("[")])
                assert len(simp)==len(trad)
                for i in range(len(trad)):
                    # Expand "-" from the corresponding DECODED simplified
                    # item.  (Indexing the raw string ch[i] gives the wrong
                    # character as soon as an earlier item contains a slash.)
                    if trad[i].endswith("-") and (trad[i]=="-" or len(simp[i])==1): trad[i]=trad[i][:-1]+simp[i] # either a - by itself, or char/- (but we don't touch it if the simp is also complex)
                ch=ch[:ch.index("[")]+"["+u"".join(trad)+"]"
        # an "environment" line is held back and prefixed, in <>, to the next definition
        elif l.strip() and "environment" in l.split()[0]: nextEnv="<"+' '.join(l.split()[1:])+"> "
        elif l.strip() and "definition" in l.split()[0]:
            en.append(nextEnv+' '.join(l.split()[1:]))
            nextEnv = ""
        elif l=="h": appendMode = 1
    if py and ch and en:
        pyList.append(py)
        chList.append(ch)
        enList.append(en)
        notesList.append(notes)
# now write out
for py,ch,en,notes in zip(pyList,chList,enList,notesList):
    # C->E entry: characters, pinyin, then definitions and non-empty notes
    oCE.write(ch.encode("utf-8")+"\t"+py+"\t"+"; ".join(en+[n for n in notes if n]).replace("\t"," ")+"\n")
    # E->C entries:
    if len(en)>1: notes=en+notes # ensure the en entries have all the definitions in them
    ecNotes="; ".join([n for n in notes if n]).replace("\t"," ")
    for head in en: oEC.write(head+"\t"+ch.encode("utf-8")+" "+py+". "+ecNotes+"\n")
oCE.close() ; oEC.close()
import unicodedata,sqlite3,sys
# One SQLite database per lookup direction; the handle names follow the order
# of fields in each entry (e=English, h=hanzi, p=pinyin).
e2hp = sqlite3.connect("ce1.sqlite")
p2he = sqlite3.connect("ce2.sqlite")
h2pe = sqlite3.connect("ce3.sqlite")
# running totals for the final report
removed = 0
added = 0
processed = 0
def addToDict(connection,uTerm,uDefinition,uSerial,uSortKey=None):
    """Insert one row into the Dictionary table of connection.

    uTerm, uDefinition -- stored in the Term and Definition columns.
    uSerial            -- our serial number, stored as "_"+uSerial in E1
                          (or E10 for short entries) so we can recognise and
                          update our own entries later.
    uSortKey           -- SortControl value; defaults to the first 15
                          characters of the search string.

    Increments the module-level counter "added".
    """
    global added
    # search string: NFD-decompose, drop combining marks (category Mn) and
    # separators (categories Z*), then upper-case
    searchString=''.join((c for c in unicodedata.normalize('NFD',uTerm) if unicodedata.category(c)!='Mn' and unicodedata.category(c)[0]!='Z')).upper()
    # E1..E10 hold the prefixes of length 1..10, padded with u'' when the
    # search string is shorter than 10 characters
    e1e10=[searchString[:n] for n in range(1,min(len(searchString),10)+1)]+[u'']*max(10-len(searchString),0)
    if not e1e10[-1]: e1e10[-1]=u"_"+uSerial # short entry: E10 is free
    else: e1e10[0]=u"_"+uSerial
    if not uSortKey: uSortKey=searchString[:15]
    connection.execute("insert into Dictionary(Term,Definition,E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,SortControl) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                       (uTerm,uDefinition)+tuple(e1e10)+(uSortKey,))
    added += 1
print "Reading cidian"
entries=open("entries.u8").read().replace("\r\n","\n").decode('utf-8').split("\n***")[1:]
print "Checking for old entries that are to be replaced"
serialNumbers = {}
for e in entries:
for l in e.split("\n"):
if l.startswith("serial"): serialNumbers[l.split()[1]]=1
elif l=="h": break
for con in [e2hp,p2he,h2pe]:
for row in con.execute("select e1,e10 from Dictionary"):
# (usually faster than sending speculative deletes)
for i in [0,1]:
if row[i][1:] in serialNumbers:
if i: what="E10"
else: what="E1"
removed += con.execute("delete from Dictionary where "+what+"=?",(row[i],)).rowcount
if removed%100==0:
print removed,"\r", ; sys.stdout.flush()
print "Adding new entries"
for e in entries:
enKeys = []; enDef=[]; chKeys=[]; py=ch=inComments=""
for l in e.split("\n")[1:]:
if inComments:
enDef.append(l) ; continue
lFirst,lRest = (l.split()+[''])[0],' '.join(l.split()[1:]).strip()
if l.startswith("serial"): sn=lRest
elif l.startswith("pinyin"): py=lRest
elif l.startswith("characters"):
ch=lRest
if '[' in ch:
trad=list(ch[ch.index("[")+1:ch.index("]")])
for i in range(len(trad)):
if trad[i]=="-": trad[i]=ch[i]
chKeys=[ch[:ch.index("[")],u"".join(trad)]
else: chKeys=[ch]
elif l=="h": inComments=1
elif lRest and not l.startswith("re") and not l.startswith("class") and not l.startswith("span") and not l.startswith("gr") and not l.startswith("freq"):
if "definition" in lFirst: enKeys.append(lRest)
elif "measure" in lFirst: lRest="MW "+lRest
if "example" in lFirst: lRest += ":"
elif not lRest[-1]==".": lRest += ";"
enDef.append(lRest)
if not py: continue # probably at the end
if enDef and enDef[-1][-1]==';': enDef[-1]=enDef[-1][:-1]
for k in enKeys: addToDict(e2hp,k,ch+" "+py,sn)
addToDict(p2he,py,ch+" "+" ".join(enDef),sn,chKeys[0])
for k in chKeys: addToDict(h2pe,k,py+" "+" ".join(enDef),sn,k)
processed += 1
if processed%100==0:
print processed,"\r", ; sys.stdout.flush()
e2hp.commit() ; p2he.commit() ; h2pe.commit()
print "Processed",processed,"cidian entries; added",added-removed,"new WM-Dict entries and updated",removed,"others"
Also add \end{CJK}. Do not add other TeX markup yet (some of it might be confused for pinyin later). Use Wenlin's "Replace tone marks with 1-4" function and save the file in the appropriate encoding, and in a Unix environment do
sed -e 's/[BCDFGHJ-NP-TV-Zbcdfghj-np-tv-z]\?h\?[AEIOUVaeiouv]\+[ngr]*[1-5]/\\&/g' -e 's/\Long/\LONG/g' -e 's/\long/\Long/g' < infile > outfile
replacing infile and outfile with the appropriate filenames. Then edit the LaTeX in any text editor as normal (adding documentclass etc). Remember to include \usepackage{CJK} and \usepackage{pinyin} in the preamble.
If you have trouble, please try a different TeX distribution. Some TeX distributions from around 2005 were particularly quirky with CJK (conflicts between usepackages, trouble with hanzi in PDF headings, unreliable UTF-8, ...) and if you have one of these then it's probably easier to upgrade it than to work around its flaws. However, if you're stuck (e.g. because some IT department forces you to use an inferior version of Linux with unusable package management) then you could try some workarounds:
% Defines \ding as a one-argument macro built from the pinyin package's
% internal \py@... macros; @ is made a letter only for this line.
% NOTE(review): presumably works around another package's \ding overriding
% the pinyin syllable "ding" - confirm.
\catcode`@=11 \def\ding#1{\py@hy d\py@i dn#1ng\py@sp{}} \catcode`@=12
after the \usepackage{pinyin}
% Wrap the pinyin package's \py@macron so that \pickup@font is temporarily
% set to \MT@orig@pickupfont while the original macro runs, and set back to
% \MT@pickupfont afterwards.  \py@macron takes two arguments in pinyin
% versions later than 2005/08/11 and one argument otherwise, hence the two
% variants.
% NOTE(review): the \MT@... names look like microtype internals - confirm
% that microtype is loaded before this.
\catcode`@=11
\let\MT@orig@py@macron\py@macron
\@ifpackagelater{pinyin}{2005/08/11}{
\def\py@macron#1#2{\let\pickup@font\MT@orig@pickupfont
\MT@orig@py@macron{#1}{#2}\let\pickup@font\MT@pickupfont}%
}{%
\def\py@macron#1{\let\pickup@font\MT@orig@pickupfont
\MT@orig@py@macron{#1}\let\pickup@font\MT@pickupfont}%
}\catcode`@=12
This should solve the problem of some tone marks being printed over spaces instead of letters.
open("hanzi.gb","w").write("".join(map(lambda l:l.split()[0],open("characters.txt").readlines()[1:])))
and open hanzi.gb in edit mode.
# Drive Wenlin through Windows Script Host: look up and print the characters
# one at a time, and save each resulting Wenlin.pdf as 0.pdf, 1.pdf, ...
numHanzi = 492
import os, time
# CutePDF's default destination file for Wenlin
# (depends on if we're on Cygwin or native Windows)
if "HOME" in os.environ: f=os.environ["HOME"]+os.sep+"Wenlin.pdf"
else: f=os.environ["HOMEDRIVE"]+os.environ["HOMEPATH"]+"\\Wenlin.pdf"
try: os.remove(f) # in case you did a test print
except OSError: pass # nothing to remove
# The VBScript is the same on every pass, so build its text once.
vbs="\n".join([
    'set WshShell = WScript.CreateObject("WScript.Shell")',
    'WshShell.AppActivate "Wenlin"',
    'WScript.Sleep 100',
    'WshShell.SendKeys "+{RIGHT}^x^l^v~^e^p"', # Shift-Right Cut Lookup Paste Enter Edit Print, i.e. look up the 1st character and print it
    'WScript.Sleep 100',
    'WshShell.SendKeys "~"', # Enter (confirm print dialogue)
    'WScript.Sleep 4000', # allow CutePDF enough time
    'WshShell.AppActivate "Save As"', # ensure got CutePDF's dialogue
    'WshShell.SendKeys "~"', # accept default Wenlin.pdf
    'WScript.Sleep 100',
    'WshShell.AppActivate "Wenlin"',
    'WshShell.SendKeys "^w"', # close hanzi entry
    ])
for h in range(numHanzi):
    # write and close the script file before Cscript is asked to read it
    v=open("_wenlin_hanzi_vbs.vbs","w") ; v.write(vbs) ; v.close()
    os.system("Cscript.exe _wenlin_hanzi_vbs.vbs")
    os.remove("_wenlin_hanzi_vbs.vbs")
    time.sleep(2)
    p=None
    while not p:
        try: p=open(f,"rb")
        # only "cannot open yet" is retried, so Ctrl-C still stops the wait
        except IOError: time.sleep(1) # allow .pdf to be written
    open(str(h)+".pdf","wb").write(p.read())
    p.close()
    os.remove(f)

pass # (so get the above blank line if pasting into interpreter)
# For each per-character PDF: render its pages to greyscale PNGs at 28 dpi,
# crop the white space above and below each page, stack the pages top to
# bottom and save the result as x<name>.png (0.pdf -> x0.png).
for P in 0.pdf [1-9]*.pdf; do
  # skip a pattern that matched no file (it is then left as literal text)
  [ -e "$P" ] || continue;
  gs -sDEVICE=pnggray -sOutputFile=myfile%02d.png -r28 -q -dNOPAUSE - < "$P";
  for M in myfile*.png; do
    pngtopnm < "$M" | pnmcrop -white -top -bottom > "$M.pnm";
  done;
  pnmcat -tb myfile*.png.pnm | pnmtopng -compression 9 > "x$(echo "$P"|sed -e s/pdf/png/)";
  # remove this PDF's temporary page images before the next one
  rm myfile*;
done
and remember to set got_extra to 1 in flashcards.html so that they will
display.