* WARNING - SPITBOL REQUIRES CR+LF LINE ENDINGS
* THE CTAN TEAM WAS INFORMED ABOUT THIS SPECIAL NEED
*
* texaccents.spt
* version 1.0.1
* 2022.09.17
* Windows version of texaccents.sno
* guido.milanese@unicatt.it
* Transforms LaTeX accents to their UTF8 equivalents
* accepts both LaTeX and Bibtex codes:
* examples: \'a \'{a} {\"a} and even {\'{a}}
* tables of accents imported from:
* https://github.com/hayk314/LaTex-handler -- Author: Hayk Aleksanyan
* with my integrations, including ligatures as \ae{}
* requires spitbol: https://github.com/spitbol/windows-nt
*****************************************************************
* MIT License - Copyright (c) 2022 Guido Milanese
* See file LICENSE in this package
*****************************************************************
******************************************************
* FUNCTIONS
******************************************************
*-- Function INITIALISE
*-- Pulls in the include files, builds the help/version messages,
*-- checks the command line and attaches the input and output files.
*--
*-- Returns : the value read from the input variable r_f.
*-- Fails   : (FRETURN) when an argument is missing or a file cannot be
*--           attached; ERROR_MSG then holds the reason for MAIN.
*-- Exits   : jumps straight to END after printing the help or version text.
*-- Globals : MAX, NL, HELP_MSG, VERS_MSG, ERROR_MSG, infile, outfile,
*--           and the I/O associated variables r_f (unit 1) and w_f (unit 2).
	define("initialise()")						:(initialise_end)
initialise
initialise_bg
* INCLUDE FILES
-include "args.inc"
-include "delete.inc"
-include "host.inc"
* grepl.inc is essential! It calls repl internally
-include "grepl.inc"
	MAX = 4000000		;* max size of input/output files = 4 megs
	NL = char(13) char(10)
	HELP_MSG = "texaccents 1.0.1" NL
+		"Converts legacy (La)TeX accents and ligatures to Unicode" NL
+		"Usage: texaccents.sno INFILE OUTFILE" NL
+		"--help print this help, then exit" NL
+		"--version print version number, then exit" NL
+		"Report bugs to <guido.milanese@unicatt.it>" NL
+		"CTAN page of the package: <https://www.ctan.org/pkg/texaccents>"
	VERS_MSG = "texaccents 1.0.1" NL NL
+		"Copyright (c) 2022 Guido Milanese" NL NL
+		"This is free software; see the source for copying conditions. There is NO" NL
+		"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." NL nl
+		"Written by Guido Milanese <guido.milanese@unicatt.it>"
*******************************************************
* opens input / output files
*******************************************************
* Defines and checks Input & Output
* help request -- exits. Both "--help" and "-help" are valid.
* The match is anchored to the whole argument, so that an input file whose
* name merely contains "-help" (e.g. my-help.tex) is not taken for a request.
	((argv[1] ? pos(0) ('--' | '-') "help" rpos(0)) (Terminal = HELP_MSG))	:s(end)
* version request -- exits. Both "--version" and "-version" are valid
	((argv[1] ? pos(0) ('--' | '-') "version" rpos(0)) (Terminal = VERS_MSG))	:s(end)
* Normal input-output with error messages.
* Error messages are used by MAIN if necessary.
* Each statement succeeds -- and so takes the FRETURN exit -- only when
* the assignment from argv fails, i.e. when the argument is missing.
	(~(infile = argv[1]) ((Terminal = HELP_MSG) (ERROR_MSG = "Input error")))	:s(freturn)
	(~(outfile = argv[2]) ((Terminal = HELP_MSG) (ERROR_MSG = "Output error")))	:s(freturn)
* Attach the files. A failed INPUT/OUTPUT call now records the reason
* before failing, so that MAIN does not print an empty explanation.
	INPUT('r_f',1, infile '[-R' MAX ']')				:s(initialise_in)
	ERROR_MSG = "Cannot open input file " infile			:(freturn)
initialise_in
	Terminal = "Reading text from " Infile
	OUTPUT('w_f',2,Outfile '[-r' MAX ']')				:s(initialise_rt)
	ERROR_MSG = "Cannot open output file " outfile			:(freturn)
initialise_rt
	initialise = r_f						:(return)
initialise_end
*************************************************************************************
*-- Function NORMACC
*-- Changes TeX accents without curly brackets to the standard form
*-- \'a --> \'{a}
*-- and replaces ligatures such as \ae --> æ | \oe --> œ
*-- requires GREPL
*--
*-- Pass    : the text to normalise
*-- Returns : the normalised text
*-- Globals : P_shortacc, Pre_shortacc, Letter_shortacc, V_shortacc, Pass2
*--
*-- NOTE(review): two properties of the loop below are kept as they were:
*--   - notany('{') needs one character in front of the backslash, so an
*--     accent that is the very first thing in the text is not normalised;
*--   - the second statement rewrites the FIRST textual occurrence of
*--     V_shortacc, which is not necessarily the occurrence just matched.
*************************************************************************************
	define("normacc(Pass)")
	P_shortacc = notany('{')
+		((('\' any("'`" '"' "^=~.")) . Pre_shortacc)
+		(any(&ucase &lcase) . Letter_shortacc)) . V_shortacc	:(normacc_end)
normacc
normacc_bg
* Look for one more short-form accent; when none is left go on to the ligatures
	Pass ? P_shortacc						:f(normacc_lg)
* Rewrite it in the canonical form and scan the text again
	Pass ? V_shortacc = Pre_shortacc '{' Letter_shortacc '}'	:(normacc_bg)
normacc_lg
* The two lists are parallel: the n-th code is replaced by the n-th character.
* The replacement list must be stored as UTF-8.
	Pass2 = grepl(Pass,
+	"\ae{} \oe{} \AE{} \OE{} \dh{} \DH{} \th{} \TH{} \o{} \O{} \l{} \L{} ",
+	"æ œ Æ Œ ð Ð þ Þ ø Ø ł Ł ")
	Terminal = "Ligatures tranformed"
normacc_rt
	normacc = Pass2							:(return)
normacc_end
*
* The pattern to normalise accents -- as \'a -- is defined as follows:
* 1. there must be no '{' before backslash: otherwise the pattern would trap also
*    BibTex codes such as {\'a}
* 2.
*    find a backslash followed by any char of the list. Saved as Pre_shortacc
* 3. find the accented letter. Saved as Letter_shortacc
* 4. The whole structure is saved as V_shortacc
* The programme will save the whole structure and substitute it in the text with:
* Pre_shortacc (as \') + '{' + Letter_shortacc + '}'
* in this way the short form -- \'a -- is transformed to the canonical \'{a}
*
* Ligatures and special chars are simply replaced with GREPL.
*************************************************************************************
*-- Function CLEANACC
*-- Replaces a LaTeX accent code with the accented Unicode letter,
*-- using the sets provided by TRANSACC
*-- \'{a} will output á
*--
*-- Text    : the text being converted
*-- Acc1    : the original LaTeX code, e.g. \"{a}
*-- Acc2    : the same code with the letter already accented, e.g. \"{ä}
*-- Returns : Text with Acc1 replaced by the bare accented letter
*************************************************************************************
	define("cleanacc(Text,Acc1,Acc2)")				:(cleanacc_end)
cleanacc
cleanacc_bg
* removes the braces: \"{ä} --> \"ä
	Acc2 = delete(Acc2,"{}")
* skips 2 chars (the backslash and the accent) and saves what remains,
* i.e. the letter: \"ä --> ä
	Acc2 ? len(2) (rem . Acc2)
* in text replaces e.g. \"{a} with ä
* the substitution is done ONCE for the whole text
	Text = repl(Text,Acc1,Acc2)
	Terminal = "Working on " Acc1
cleanacc_rt
	cleanacc = Text							:(return)
cleanacc_end
*******************************************************
*-- Function TRANSACC
*-- transforms LaTeX/Bibtex accents to Unicode
*--
*-- Pass    : the input text
*-- Returns : the text with every recognised accent code converted
*-- Globals : Lat_acc, V_lat_acc, P_letter, C_noacc, P_lat_code,
*--           V_lat_code, V_lat_code_2
*******************************************************
	define("transacc(Pass)")
*
* Accents used by Latex, e.g. \'{a} = acute accent over 'a'
	Lat_acc =
+	any('"' "'" "`" 'H' '^' 'v' 'u' 'c' '.' 'd' 'k' '~' '=' 'b' 'r' )
+	. V_lat_acc
*
* The accented letter: any ASCII letter, saved as C_noacc
	P_letter = any(&lcase &ucase) . C_noacc
*
* P_lat_code recognises three spellings of an accented letter:
*   {\'{a}}   BibTeX group around the canonical form
*   {\'a}     BibTeX group around the short form
*   \'{a}     canonical LaTeX form (\'a was already "normalised" by NORMACC)
* Lat_acc is defined above; the accent found is saved as V_lat_acc and the
* whole code as V_lat_code.
* A closing '}' is consumed only together with the opening '{' of the same
* spelling. The braces of an enclosing group -- e.g. the last brace of
* {Caf\'{e}} -- therefore stay in the text and the output remains balanced.
* NOTE(review): the second spelling also matches letter-named accents
* written without braces, e.g. {\cc} -- and therefore {\bf} or {\rm} too.
	P_lat_code =
+		(	('{\' Lat_acc '{' P_letter '}}')
+		|	('{\' Lat_acc P_letter '}')
+		|	('\' Lat_acc '{' P_letter '}')
+		) . V_lat_code						:(transacc_end)
transacc
transacc_bg
latcode_bg
	V_lat_code =		;* cleans previous value
	V_lat_code_2 =		;* cleans previous value
* Scans input text to locate the pattern
* if no other LaTeX accents, fails and returns to main
* if yes, goes to the section that transforms this accent.
* e.g. if the accent is umlaut, will jump to TR"
* After each transformation we have:
* 1. the original code, e.g. \"{a} or {\"a} or {\"{a}}
* 2. the transformed code, e.g. \"{ä} or {\"ä} or {\"{ä}}
* Each section calls CLEANACC
* CLEANACC will remove all the {}\ and the LaTeX accent
* and substitute in the input text e.g. \"{ä} with ä
* In every table the two lists are parallel and must be stored as UTF-8.
	Pass ? P_lat_code						:f(latcode_nd)
									:($('TR' V_lat_acc))
TR"		;* umlaut
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c E e H h I i K k M m N n O o P p Q q S s T t U u V v W w X x Y y Z z ",
+	"Ä ä B̈ b̈ C̈ c̈ Ë ë Ḧ ḧ Ï ï K̈ k̈ M̈ m̈ N̈ n̈ Ö ö P̈ p̈ Q̈ q̈ S̈ s̈ T̈ ẗ Ü ü V̈ v̈ Ẅ ẅ Ẍ ẍ Ÿ ÿ Z̈ z̈ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR'		;* acute
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z ",
+	"Á á B́ b́ Ć ć D́ d́ É é F́ f́ Ǵ ǵ H́ h́ Í í J́ ȷ́ Ḱ ḱ Ĺ ĺ Ḿ ḿ Ń ń Ó ó Ṕ ṕ Q́ q́ Ŕ ŕ Ś ś T́ t́ Ú ú V́ v́ Ẃ ẃ X́ x́ Ý ý Ź ź ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRH		;* double acute - \H{o} Hungarian
	V_lat_code_2 = grepl(V_lat_code,
+	"A a E e I i M m O o U u ",
+	"A̋ a̋ E̋ e̋ I̋ i̋ M̋ m̋ Ő ő Ű ű ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR`		;* grave
	V_lat_code_2 = grepl(V_lat_code,
+	"A a Æ æ E e H h I i K k M m N n O o R r S s T t U u V v W w X x Y y Z z ",
+	"À à Æ̀ æ̀ È è H̀ h̀ Ì ì K̀ k̀ M̀ m̀ Ǹ ǹ Ò ò R̀ r̀ S̀ s̀ T̀ t̀ Ù ù V̀ v̀ Ẁ ẁ X̀ x̀ Ỳ ỳ Z̀ z̀ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR^		;* circumflex \^{o}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e G g H h I i J j K k L l M m N n O o R r S s T t U u V v W w X x Y y Z z ",
+	"Â â B̂ b̂ Ĉ ĉ D̂ d̂ Ê ê Ĝ ĝ Ĥ ĥ Î î Ĵ ĵ K̂ k̂ L̂ l̂ M̂ m̂ N̂ n̂ Ô ô R̂ r̂ Ŝ ŝ T̂ t̂ Û û V̂ v̂ Ŵ ŵ X̂ x̂ Ŷ ŷ Ẑ ẑ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRv		;* caron hraceck \v{s}
* For accents that are also normal letters GREPL accents the accent
* name as well (\v becomes \v̌), so we need to restore the accent to
* its original value.
* This applies to: v u c d b r
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z ",
+	"Ǎ ǎ B̌ b̌ Č č Ď ď Ě ě F̌ f̌ Ǧ ǧ Ȟ ȟ Ǐ ǐ J̌ ǰ Ǩ ǩ Ľ ľ M̌ m̌ Ň ň Ǒ ǒ P̌ p̌ Q̌ q̌ Ř ř Š š Ť ť Ǔ ǔ V̌ v̌ W̌ w̌ X̌ x̌ Y̌ y̌ Ž ž ")
	V_lat_code_2 = repl(V_lat_code_2,"\v̌","\v")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRu		;* breve \u{o}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a C c E e G g I i K k M m N n O o P p R r T t U u V v X x Y y ",
+	"Ă ă C̆ c̆ Ĕ ĕ Ğ ğ Ĭ ĭ K̆ k̆ M̆ m̆ N̆ n̆ Ŏ ŏ P̆ p̆ R̆ r̆ T̆ t̆ Ŭ ŭ V̆ v̆ X̆ x̆ Y̆ y̆ ")
	V_lat_code_2 = repl(V_lat_code_2,"\ŭ","\u")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRc		;* cedilla \c{c}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e G g H h I i K k L l M m N n O o Q q R r S s T t U u X x Z z ",
+	"A̧ a̧ B̧ b̧ Ç ç Ḑ ḑ Ȩ ȩ Ģ ģ Ḩ ḩ I̧ i̧ Ķ ķ Ļ ļ M̧ m̧ Ņ ņ O̧ o̧ Q̧ q̧ Ŗ ŗ Ş ş Ţ ţ U̧ u̧ X̧ x̧ Z̧ z̧ ")
	V_lat_code_2 = repl(V_lat_code_2,"\ç","\c")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR.		;* dot \.{o}
* The entry for i is i + combining dot above, with no additional accent
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e F f G g H h I i K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z ",
+	"Ȧ ȧ Ḃ ḃ Ċ ċ Ḋ ḋ Ė ė Ḟ ḟ Ġ ġ Ḣ ḣ İ i̇ K̇ k̇ L̇ l̇ Ṁ ṁ Ṅ ṅ Ȯ ȯ Ṗ ṗ Q̇ q̇ Ṙ ṙ Ṡ ṡ Ṫ ṫ U̇ u̇ V̇ v̇ Ẇ ẇ Ẋ ẋ Ẏ ẏ Ż ż ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRd		;* dot under the letter \d{u}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z ",
+	"Ạ ạ Ḅ ḅ C̣ c̣ Ḍ ḍ Ẹ ẹ F̣ f̣ G̣ g̣ Ḥ ḥ Ị ị J̣ j̣ Ḳ ḳ Ḷ ḷ Ṃ ṃ Ṇ ṇ Ọ ọ P̣ p̣ Q̣ q̣ Ṛ ṛ Ṣ ṣ Ṭ ṭ Ụ ụ Ṿ ṿ Ẉ ẉ X̣ x̣ Ỵ ỵ Ẓ ẓ ")
	V_lat_code_2 = repl(V_lat_code_2,"\ḍ","\d")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRk		;* ogonek \k{a}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a E e I i O o U u Y y ",
+	"Ą ą Ę ę Į į Ǫ ǫ Ų ų Y̨ y̨ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR~		;* tilde \~o
	V_lat_code_2 = grepl(V_lat_code,
+	"A a E e I i N n O o U u V v Y y ",
+	"Ã ã Ẽ ẽ Ĩ ĩ Ñ ñ Õ õ Ũ ũ Ṽ ṽ Ỹ ỹ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TR=		;* macron \=a \={a}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e G g I i J j K k M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z ",
+	"Ā ā B̄ b̄ C̄ c̄ D̄ d̄ Ē ē Ḡ ḡ Ī ī J̄ j̄ K̄ k̄ M̄ m̄ N̄ n̄ Ō ō P̄ p̄ Q̄ q̄ R̄ r̄ S̄ s̄ T̄ t̄ Ū ū V̄ v̄ W̄ w̄ X̄ x̄ Ȳ ȳ Z̄ z̄ ")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRb		;* bar under the letter \b{a}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e G g H h I i J j K k L l M m N n O o P p R r S s T t U u X x Y y Z z ",
+	"A̱ a̱ Ḇ ḇ C̱ c̱ Ḏ ḏ E̱ e̱ G̱ g̱ H̱ ẖ I̱ i̱ J̱ j̱ Ḵ ḵ Ḻ ḻ M̱ m̱ Ṉ ṉ O̱ o̱ P̱ p̱ Ṟ ṟ S̱ s̱ Ṯ ṯ U̱ u̱ X̱ x̱ Y̱ y̱ Ẕ ẕ ")
	V_lat_code_2 = repl(V_lat_code_2,"\ḇ","\b")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
TRr		;* ring over the letter \r{a}
	V_lat_code_2 = grepl(V_lat_code,
+	"A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q r R S s U u V v W w X x Y y Z z ",
+	"Å å B̊ b̊ C̊ c̊ D̊ d̊ E̊ e̊ F̊ f̊ G̊ g̊ H̊ h̊ I̊ i̊ J̊ j̊ K̊ k̊ L̊ l̊ M̊ m̊ N̊ n̊ O̊ o̊ P̊ p̊ Q̊ q̊ r̊ R̊ S̊ s̊ Ů ů V̊ v̊ W̊ ẘ X̊ x̊ Y̊ ẙ Z̊ z̊ ")
	V_lat_code_2 = repl(V_lat_code_2,"\r̊","\r")
	Pass = cleanacc(Pass,V_lat_code,V_lat_code_2)			:(latcode_bg)
latcode_nd
	transacc = Pass							:(return)
transacc_end
*******************************************************
* MAIN
*******************************************************
* Stops with an explanation when INITIALISE fails
	(~(texfile = initialise()) (terminal = "Could not start programme - " ERROR_MSG))	:s(end)
* Short accents to canonical form, ligatures to Unicode
	Texfile = normacc(Texfile)
* Converts the accents and writes the result through w_f;
* the second alternative is evaluated only if the first one fails
	(((w_f = transacc(texfile)) (Terminal = "Done. File " Outfile " written")),
+	(Terminal = "Conversion failed"))
END