#!/usr/bin/python
# coding=utf-8
# Copyright Johannes 2011
# mail@johanneswilm.org
# Licensed under the GPL v. 3
#
# Convert LyX documents to LaTeX, prepare the LaTeX as HTML for machine
# translation, and clean up the translated LaTeX afterwards.
# The script targets Python 2 (print statements, the "commands" module).

import re
import sys
from sys import exit
from commands import getstatusoutput


def lyx2tex(filename):
    # Export a .lyx file to LaTeX with the lyx command line and fix up the output.
    basefilename = filename[:-4]
    print "Converting " + filename + " -> " + basefilename + ".tex..."
    runcommand('lyx -e luatex -f all ' + filename)
    #print "Removing extra headers..."
    #regex_remove_shell(basefilename+'.tex',r'(?s)\\begin{document}(.*)\\end{document}')
    #print "Remove extra bibliographies..."
    # remove natbib bibliographies
    #regex_replace(basefilename+'.tex',r'\\bibliography{([^}]*)}','')
    print "Fixing latex..."
    runcommand('rpl "{[}" "[" ' + basefilename + '.tex')
    runcommand('rpl "{]}" "]" ' + basefilename + '.tex')
    runcommand('rpl -q ">{\\raggedright}" "" ' + basefilename + '.tex')
    runcommand('rpl -q ">{\\raggedleft}" "" ' + basefilename + '.tex')
    #runcommand('rpl -q "\\lyxdot " "." '+basefilename+'.tex')
    return filename[:-4] + '.tex'


def runcommand(command, exit_on_failure=True):
    # Run a shell command; on failure print the command and its output and abort.
    output = getstatusoutput(command)
    if output[0] != 0 and exit_on_failure:
        print command
        print output[1]
        exit()
    return output


def regex_replace(filename, from_string, to_string):
    # Apply re.sub to the contents of filename, rewriting the file in place.
    orig_filecontents = open(filename).read()
    replacement_file = open(filename, 'w')
    replacement_file.write(re.sub(from_string, to_string, orig_filecontents))
    replacement_file.close()
    return True


def split_html(filename, words):
    # Split a large HTML file into parts of roughly 5000 words each.
    # NOTE: the exact HTML tag literals used below ('<html><body>',
    # '</body></html>', '<p>') are assumptions about the markup that
    # fit_for_html() produces.
    splits = words / 5000
    regex_replace(filename, '<html><body>', '')
    regex_replace(filename, '</body></html>', '')
    orig_filecontents = open(filename).read()
    orig_filecontents_plist = orig_filecontents.split('<p>')
    orig_filecontents_plist_len = len(orig_filecontents_plist)
    orig_filecontents_plist_len_part = orig_filecontents_plist_len / splits
    for split in range(splits):
        start = orig_filecontents_plist_len_part * split
        end = orig_filecontents_plist_len_part * (split + 1)
        if split == splits - 1:
            # let the last part pick up any paragraphs left over by the integer division
            end = orig_filecontents_plist_len
        part_filecontents = ('<html><body>'
                             + "<p>".join(orig_filecontents_plist[start:end])
                             + '</body></html>')
        part_file = open(filename[:-5] + '_' + str(split) + '.html', 'w')
        part_file.write(part_filecontents)
        part_file.close()
    runcommand('rm ' + filename)
    return True


def spacerepl(matchobj):
    # re.sub callback: protect spaces inside a matched argument with a placeholder
    return matchobj.group(0) + matchobj.group(1).replace(' ', 'SPACEMARK') + matchobj.group(2)


def dotrepl(matchobj):
    # re.sub callback: protect dots inside a matched argument with a placeholder
    return matchobj.group(0) + matchobj.group(1).replace('.', 'DOTMARK') + matchobj.group(2)


def commarepl(matchobj):
    # re.sub callback: protect commas inside a matched argument with a placeholder
    return matchobj.group(0) + matchobj.group(1).replace(',', 'COMMAMARK') + matchobj.group(2)


def fit_for_html(html_file):
    # Prepare an exported LaTeX file for machine translation: wrap it as HTML,
    # mark paragraph breaks, and shield LaTeX markup from the translator.
    # NOTE: the '<html><body>', '</body></html>' and '<p>' literals below are
    # assumptions; adjust them if your translation workflow expects other markup.
    regex_replace(html_file, '{}`', '`')  # cleaning up
    regex_replace(html_file, r'\\%', 'PERCENTAGE')
    regex_replace(html_file, r'%([^\n]*)\n', r'\n')  # cleaning up: strip LaTeX comment lines
    regex_replace(html_file, 'PERCENTAGE', r'\\%')
    regex_replace(html_file, r'^^', '<html><body>')
    regex_replace(html_file, r'$$', '</body></html>')
    regex_replace(html_file, r'\n\n', r'\n<p>\n')
    # the next step removed a stray tag that followed a newline; the exact tag
    # literal is uncertain, so the call is left disabled:
    # regex_replace(html_file, r'\n<...>', r'\n')
    regex_replace(html_file, r'\\begin{document}', r'\\begin{document}<p>\n<p>')
    regex_replace(html_file, r'\\\\', 'SLASH')
    regex_replace(html_file, r'\\\[', 'BEGINPARENTESIS')
    regex_replace(html_file, r'\\\]', 'ENDPARENTESIS')
    regex_replace(html_file, r'\\\{', 'BEGINBRACKET')
    regex_replace(html_file, r'\\\}', 'ENDBRACKET')
    regex_replace(html_file, r'\{\}', 'EMPTYBRACKET')
    regex_replace(html_file, r'\\protect', '')
    regex_replace(html_file, r'\\(hline|tabularnewline|ldots| \& |item)', r'\\\1')  # without contents
    regex_replace(html_file, r'\\(index){([^}]*)}', r'\\\1{\2endparentesis')  # translate contents
    regex_replace(html_file, r'\\(citep|citet|citetitle|citeauthor|begin|includegraphics|label|bibliography|input|include|ref|vref|pageref)({|\[)([^}]*)}', r'\\\1\2\3endparentesis')  # don't translate contents
    regex_replace(html_file, r'\\(end)({|\[)([^}]*)}', r'<p>\\\1\2\3endparentesis<p>')  # don't translate contents and add paragraph end ('<p>' markers assumed)
    regex_replace(html_file, r'\\(emph){([^}]*)}', r'\\\1{\2endparentesis')  # translate contents
    regex_replace(html_file, r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote){([^}]*)}', r'\\\1{\2}')  # translate contents
    regex_replace(html_file, r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote)\[([^\]]*)\]{([^}]*)}', r'\\\1[\2]{\3}')  # translate contents
    regex_replace(html_file, r'\\(addcontentsline)({[^}]*})({[^}]*}){([^}]*)}', r'\\\1\2\3{\4}')  # translate some contents
    regex_replace(html_file, r'\\(item) \[{([^}]*)}\]', r'\\\1 [\2]')  # don't translate and remove extra bracket
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', spacerepl)  # temporarily remove spaces inside arguments
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', dotrepl)  # temporarily remove dots inside arguments
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', commarepl)  # temporarily remove commas inside arguments
    regex_replace(html_file, r'\\(parencites)([^\s\\\<\.\,]*)(\r|\s|\\|\<|\.|\,|\r)', r'\\\1\2\3')  # don't translate contents, goes on forever
    regex_replace(html_file, 'endparentesis', '}')
    regex_replace(html_file, r'({|\[)([^\s\r]*)(\r|\s)', r'\1\2\3')  # some arguments fell outside of span
    regex_replace(html_file, '&', '&')
    regex_replace(html_file, r' \\\$', r'-\\$')  # with space in front
    regex_replace(html_file, r'\\\$', r'\\$')
    regex_replace(html_file, '~', '~')
    regex_replace(html_file, '\`', r'`')
    regex_replace(html_file, '\'s', "APOSMARKs")  # genitive s singular
    regex_replace(html_file, 's\'', "sAPOSMARK")  # genitive s plural
    regex_replace(html_file, r'([A-Za-z])\'([A-Za-z])', r"\1APOSMARK\2")  # inside word, not likely quotation marks
    regex_replace(html_file, '\'', " '")
    regex_replace(html_file, 'APOSMARK', '\'')
    regex_replace(html_file, 'SPACEMARK', ' ')
    regex_replace(html_file, 'DOTMARK', '.')
    regex_replace(html_file, 'COMMAMARK', ',')  # restore commas
    regex_replace(html_file, 'BEGINPARENTESIS', r'\\\[')
    regex_replace(html_file, 'ENDPARENTESIS', r'\\\]')
    regex_replace(html_file, 'BEGINBRACKET', r'\\\{')
    regex_replace(html_file, 'ENDBRACKET', r'\\\}')
    regex_replace(html_file, 'SLASH', r'\\\\')
    regex_replace(html_file, 'EMPTYBRACKET', r'{}')
    regex_replace(html_file, r'\.', r'.')
    regex_replace(html_file, ',', ',')
    # regex_replace(html_file,r'{','BEGINBRACKET')
    # regex_replace(html_file,r'}','ENDBRACKET')
    return True


def import_lyx():
    # Convert every .lyx file in the current directory into translation-ready HTML.
    lyx_files = runcommand('ls *lyx')[1].split()
    for lyx_file in lyx_files:
        tex_file = lyx2tex(lyx_file)
        html_file = tex_file.replace('.', '_') + '.html'
        runcommand('cp ' + tex_file + ' ' + html_file)
        fit_for_html(html_file)
        words = int(runcommand('wc -w ' + html_file)[1].split()[0])
        if words / 5000 > 1:
            split_html(html_file, words)


def exclamrepl(matchobj):
    # re.sub callback for \index entries: collapse "! " back to "!"
    # (the exact search string is an assumption)
    return matchobj.group(0).replace('! ', '!')

def fix_latex():
    # Clean up translated .tex files: drop the translation header and tighten spacing.
    tex_files = runcommand('ls *tex')[1].split()
    for tex_file in tex_files:
        regex_replace(tex_file, r'Translated version of ([^\.]*)\.html', '')  # remove header
        regex_replace(tex_file, r'\\index{([^}]*)}', exclamrepl)  # replace exclamation marks
        regex_replace(tex_file, r'({|\[) ', r'\1')  # remove extra spaces
        regex_replace(tex_file, r' (}|\])', r'\1')  # remove extra spaces
        regex_replace(tex_file, r' \\index', r'\\index')  # remove extra spaces
        regex_replace(tex_file, r'` ', r'`')  # remove extra spaces
        regex_replace(tex_file, ' \'', '\'')  # remove extra spaces
        regex_replace(tex_file, ' ~ ', '~')  # remove extra spaces
        regex_replace(tex_file, r' \\\\$', r'\\$')  # remove extra spaces
        regex_replace(tex_file, r'\-\\\\$', r'\\$')  # remove extra spaces


if __name__ == "__main__":
    sys_argv = sys.argv
    command_options = "i"
    if len(sys_argv) > 1 and sys_argv[1][0] == '-':
        command_options = sys_argv.pop(1)[1:]
    if 'h' in command_options:
        print
        print "Usage:"
        print sys_argv[0] + " -[hif]"
        print
        print "h)elp"
        print "i)mport"
        print "f)ix output"
        print
    if 'i' in command_options:
        import_lyx()
    if 'f' in command_options:
        fix_latex()