#! /bin/bash

# compound.sh: Find possible compound words
# Copyright: Rune Kleveland 2000 (runekl@math.uio.no)
# Licence: GPL

# This small script tries to find those pairs or triples of words in
# a Norwegian text file that should be written as one word (without
# hyphens!).  It reads the given file(s), or standard input when no
# file is given, and produces a hyphenated list of compound words in
# input order.  The words are hyphenated using TeX and the language
# norskc.  If your TeX installation doesn't provide these hyphenation
# patterns, you can give the -notex option until you feel brave enough
# to fix the real problem.

# This can help people who have problems writing correct Norwegian to
# avoid errors like `arkitekt tegnet', `matematikk lærer' etc.  But
# the script will not find rare compounds, and it might very well
# produce output even if there are no errors (å lese bøkene/de
# lesebøkene, med følelse/medfølelse, for andre/forandre).  It is a
# small tool, not the holy grail.

# compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]
#
#       -p patterns Choose the patterns to hyphenate with.
#                   The default is norskc, in which case TeX
#                   executes \language=\l@norskc.
#
#       -l language Choose the ispell dictionary.  The default is norsk.
#
#       -all        Also print words not containing a hyphen
#
#       -s          Sort the words by frequency
#
#       -notex      Don't use TeX
#
# Requires GNU sed (the sed scripts use `\?', `\+', `\|' and the
# `first~step' address form), ispell and, unless -notex is given, latex.

TMP=/tmp
CH=a-zæøåéèêôóòçA-ZÆØÅÉÈÊÔÓÒÇ
LATEX=latex
LANGUAGE=norsk
PATTERNS=norskc
ONLYHYPHEN=true
SORTFREQ=false
NOTEX=false

# Print the usage message on stderr and exit with status 2.
usage() {
  echo 'Usage: compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]' 1>&2
  exit 2
}

# Print an error message on stderr and exit with status 1.
die() {
  printf 'compound: %s\n' "$*" 1>&2
  exit 1
}

# Option parsing.  Everything left in "$@" afterwards is handed to sed
# as input file names; a lone `-' stops option parsing and is kept, so
# sed receives it as a file name.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -p)
      # An option without its value used to yield an empty setting.
      [[ $# -ge 2 ]] || usage
      PATTERNS=$2
      shift
      ;;
    -l)
      [[ $# -ge 2 ]] || usage
      LANGUAGE=$2
      shift
      ;;
    -all)
      ONLYHYPHEN=false
      ;;
    -s)
      SORTFREQ=true
      ;;
    -notex)
      NOTEX=true
      ;;
    -)
      break
      ;;
    -*)
      usage
      ;;
    *)
      break
      ;;
  esac
  shift
done

# Fail early when a required tool is missing.  Without ispell the list
# of misspelled words would be empty and every candidate would be
# reported as a valid compound.
command -v ispell > /dev/null || die 'ispell not found'
if [[ "${NOTEX}" != true ]]; then
  command -v "${LATEX}" > /dev/null || die "${LATEX} not found (try -notex)"
fi

# We make too many files in the process.  Keep them in a private
# directory below $TMP so that concurrent runs cannot clobber each
# other, and remove the whole directory on every exit path.
WORKDIR=$(mktemp -d "${TMP}/compound.XXXXXX") || die 'cannot create work directory'
trap 'rm -rf -- "${WORKDIR}"' EXIT

# Hyphenation commands result in a lot of unnecessary output.  Try to
# remove those first.  Also remove "- commands and %-comments.  The
# result is one word per line.  This runs before the cd below so that
# relative input file names still resolve.
sed -e 's/%.*//' \
    -e '/\\hyphenation[ ]*{.*}/ D' \
    -e '/\\hyphenation[ ]*{[^}]*$/,/}/ D' \
    -e 's/\"-//g' "$@" \
  | tr -cs "${CH}" '\n' > "${WORKDIR}/comp1.tmp"

cd "${WORKDIR}" || die "cannot cd to ${WORKDIR}"

# Print file $1 without its first line and with the sentinel XXX
# appended after its last line (i.e. the word list shifted by one).
shift_words() {
  sed -e '1 d' -e '$ a \
XXX' "$1"
}

# Can someone come up with a smart sed script to make comp4 directly?
shift_words comp1.tmp > comp2.tmp
shift_words comp2.tmp > comp3.tmp

# For every position emit the pair "w1-w2" and the triple "w1-w2-w3"
# on separate lines, drop candidates that start with a one-letter
# word, end with a one- or two-letter word, or contain the XXX
# sentinel, and glue the remaining words together.
paste -d '-\n--' comp1.tmp comp2.tmp comp1.tmp comp2.tmp comp3.tmp \
  | grep -v -e '\(^.-\|-..\?$\|XXX\)' \
  | tr -d '-' \
  > comp4.tmp

# Spellcheck the file, and find the words not in the dictionary.
ispell -B -l -d "${LANGUAGE}" < comp4.tmp > comp5.tmp

# Print the candidates that are missing from ispell's list of
# misspelled words, one per line: the `<' lines of the diff with the
# two-character diff prefix removed.
accepted_candidates() {
  diff comp4.tmp comp5.tmp | grep '<' | sed -e 's/^..//'
}

# Hyphenate the accepted candidates using TeX, to filter out words
# like sommer (som mer) etc., i.e. the words that are not compounds.
# Prints the hyphenated words, one per line.
hyphenate_candidates() {
  local -a tex_source

  # Wrap the words in \writelog{...} groups, starting a new group
  # after every 1000th word.
  accepted_candidates \
    | sed -e '1 i \
\\writelog{' \
          -e '1000~1000 a \
}\
\\writelog{' \
          -e '$ a \
}' \
    > comp6.tmp

  # The TeX source is passed on the command line; the original passed
  # the same text as whitespace-separated words.
  tex_source=(
    '\nonstopmode'
    '\documentclass{minimal}'
    '\usepackage{t1enc}'
    '\makeatletter'
    '\language=\l@'"${PATTERNS}"'\lefthyphenmin=2\righthyphenmin=2'
    '\ifx\gendiscretionary\@undefined\else'
    '\hyphenclassesstate=1\hyphenclasses=5\exhyphenclass=4'
    '\fi'
    '\makeatother'
    '\def\writelog#1{\setbox0=\vbox{\parfillskip0pt \hsize16383.99999pt'
    '\pretolerance=-1 \tolerance=-1 \hbadness=0 \showboxdepth=0 \ #1}}'
    '\begin{document}'
    '\input{comp6.tmp}'
    '\typeout{----------}\end{document}'
  )

  # The original wrote `2&>/dev/null', which bash reads as an extra
  # argument `2' followed by `&>/dev/null'.  TeX's terminal output is
  # intentionally discarded; only minimal.log is used below.
  "${LATEX}" "${tex_source[@]}" < /dev/null > /dev/null 2>&1

  # Extract the hyphenated words from the log: drop everything up to
  # the line that opens comp6.tmp and everything from the ----------
  # marker on, drop \hbox and blank lines, and turn each font selector
  # (\T1...) into a `*' word separator.
  # NOTE(review): newer TeX distributions appear to log the file as
  # `(./comp6.tmp'; the optional `./' accepts both spellings.
  sed -e '1,/^(\([.][/]\)\?comp6[.]tmp/ d' \
      -e '/\\hbox/ d' \
      -e '/^ *$/ d' \
      -e 's/^\(\[\]\)\? *\\T1[^ ]* /*/' \
      -e '/^----------/,$ d' minimal.log \
    | tr -d '\n)[]' \
    | tr -s ' ' '\n' \
    | sed -e 's/-\*//' \
          -e '1 s/\*//' \
          -e 's/\*/\
/' \
          -e 's/\^\^c5/Å/g' \
          -e 's/\^\^c6/Æ/g' \
          -e 's/\^\^d8/Ø/g' \
          -e 's/\^\^c7/Ç/g' \
          -e 's/\^\^c8/È/g' \
          -e 's/\^\^c9/É/g' \
          -e 's/\^\^d2/Ò/g' \
          -e 's/\^\^d3/Ó/g' \
          -e 's/\^\^d4/Ô/g' \
          -e 's/\^\^e5/å/g' \
          -e 's/\^\^e6/æ/g' \
          -e 's/\^\^f8/ø/g' \
          -e 's/\^\^e7/ç/g' \
          -e 's/\^\^e8/è/g' \
          -e 's/\^\^e9/é/g' \
          -e 's/\^\^f2/ò/g' \
          -e 's/\^\^f3/ó/g' \
          -e 's/\^\^f4/ô/g' \
          -e 's/^\([^-0-9]\+\)\([0-9]\+\)/\1\
\2/' \
    | if [[ "${ONLYHYPHEN}" = true ]]; then
        grep -e -
      else
        cat
      fi
}

# Print the final word list, hyphenated unless -notex was given.
report() {
  if [[ "${NOTEX}" = true ]]; then
    accepted_candidates
  else
    hyphenate_candidates
  fi
}

if [[ "${SORTFREQ}" = true ]]; then
  report | sort | uniq -c | sort -n -r -s
else
  report
fi

# The EXIT trap removes the work directory.  Exit with status 0 as the
# original script did, even when grep found nothing to print.
exit 0