diff options
Diffstat (limited to 'inorsk-compwordsmaybe')
-rw-r--r-- | inorsk-compwordsmaybe | 198 |
1 files changed, 198 insertions, 0 deletions
#! /bin/bash
# Provenance: diff --git a/inorsk-compwordsmaybe b/inorsk-compwordsmaybe
#             (new file mode 100644, index 0000000..be20481)

# compound.sh: Find possible compound words
# Copyright: Rune Kleveland 2000 (runekl@math.uio.no)
# Licence: GPL

# This small script tries to find those pairs or triples of words in
# a Norwegian text file that should be written as one word (without
# hyphens!).  It reads from standard input (or from the given files)
# and produces a hyphenated list of compound words in input order.
# The words are hyphenated using TeX and the language norskc.  If your
# TeX installation doesn't provide these hyphenation patterns, you can
# give the -notex option until you feel brave enough to fix the real
# problem.

# This can help people who have problems writing correct Norwegian to
# avoid errors like `arkitekt tegnet', `matematikk lærer' etc.  But
# the script will not find rare compounds, and it might very well
# produce output even if there are no errors (å lese bøkene/de
# lesebøkene, med følelse/medfølelse, for andre/forandre).  It is a
# small tool, not the holy grail.

# compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]
#
#  -p patterns   Choose the patterns to hyphenate with.
#                The default is norskc, in which case TeX
#                executes \language=\l@norskc.
#
#  -l language   Choose the ispell dictionary.  The default is norsk.
#
#  -all          Also print words not containing a hyphen
#
#  -s            Sort the words by frequency
#  -notex        Don't use TeX
#
# Exit status: 2 on a usage error, 1 if a required tool or the
# temporary work directory is unavailable.

TMP=${TMPDIR:-/tmp}
CH=a-zæøåéèêôóòçA-ZÆØÅÉÈÊÔÓÒÇ
LATEX=latex
LANGUAGE=norsk
PATTERNS=norskc
ONLYHYPHEN=true
SORTFREQ=false
NOTEX=false

# Print the synopsis on stderr and leave with the usage-error status.
usage() {
  echo 'Usage: compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]' 1>&2
  exit 2
}

# Abort with a clear message when an external program is not installed.
need() {
  command -v "$1" > /dev/null 2>&1 || {
    echo "compound: required program '$1' not found" 1>&2
    exit 1
  }
}

while [ $# != 0 ]; do
  case "$1" in
    -p)
      # The option needs a value; refuse to continue with an empty one.
      [ $# -ge 2 ] || usage
      PATTERNS=$2
      shift
      ;;
    -l)
      [ $# -ge 2 ] || usage
      LANGUAGE=$2
      shift
      ;;
    -all)
      ONLYHYPHEN=false
      ;;
    -s)
      SORTFREQ=true
      ;;
    -notex)
      NOTEX=true
      ;;
    -)
      break
      ;;
    -*)
      usage
      ;;
    *)
      break
      ;;
  esac
  shift
done

need ispell
if [ "${NOTEX}" != true ]; then
  need "${LATEX}"
fi

# We make too many files in the process.  Keep them in a private
# directory below $TMP so that concurrent runs cannot clash and nobody
# can pre-create the file names; the trap removes everything on exit.
WORKDIR=$(mktemp -d "${TMP}/compound.XXXXXX") || {
  echo 'compound: cannot create a temporary directory' 1>&2
  exit 1
}
trap 'rm -rf -- "${WORKDIR}"' EXIT

# Hyphenation commands result in a lot of unnecessary output.  Try to
# remove those first.  Also remove "- commands.  This must run before
# the cd below so that relative input file names still resolve.
# comp1.tmp ends up with one word per line.

sed -e 's/%.*//' \
    -e '/\\hyphenation[ ]*{.*}/ D' \
    -e '/\\hyphenation[ ]*{[^}]*$/,/}/ D' \
    -e 's/\"-//g' "$@" \
  | tr -cs "${CH}" '\n' > "${WORKDIR}/comp1.tmp"

cd "${WORKDIR}" || exit 1

# Can someone come up with a smart sed script to make comp4 directly?
# comp2/comp3 are comp1 shifted up by one/two lines and padded with the
# marker XXX, so that pasting them side by side yields neighbouring
# words.

sed -e '1 D' -e '$ a \
XXX' comp1.tmp > comp2.tmp

sed -e '1 D' -e '$ a \
XXX' comp2.tmp > comp3.tmp

# The delimiter list "-", newline, "-", "-" makes paste emit
# "w1-w2" and "w1-w2-w3" on separate lines.  Drop candidates whose
# first word has one character, whose last word has at most two, or
# that contain the XXX padding; then glue the words together.

paste -d '-\n--' comp1.tmp comp2.tmp comp1.tmp comp2.tmp comp3.tmp \
  | grep -v -e '\(^.-\|-..\?$\|XXX\)' \
  | tr -d '-' \
  > comp4.tmp

# Spellcheck the file, and find the words not in the dictionary.

ispell -B -l -d "${LANGUAGE}" < comp4.tmp > comp5.tmp

# Print the candidates that ispell did NOT reject, i.e. the lines that
# are in comp4.tmp but not in comp5.tmp, in input order.
accepted_candidates() {
  diff comp4.tmp comp5.tmp | grep '^<' | sed -e 's/^..//'
}

# Hyphenate the words on stdin using TeX, to filter out words like
# sommer (som mer) etc, i.e. the words that are not compounds.  The
# hyphenated words are written to stdout.
hyphenate_with_tex() {
  local -a tex_words

  # Wrap the words in \writelog{...} groups of at most 1000 lines
  # (the first~step address is a GNU sed extension).
  sed -e '1 i \
\\writelog{' \
      -e '1000~1000 a \
}\
\\writelog{' \
      -e '$ a \
}' \
      > comp6.tmp

  TEXFILE='\nonstopmode
\documentclass{minimal}
\usepackage{t1enc}
\makeatletter
\language=\l@'"${PATTERNS}"'\lefthyphenmin=2\righthyphenmin=2
\ifx\gendiscretionary\@undefined\else
\hyphenclassesstate=1\hyphenclasses=5\exhyphenclass=4
\fi
\makeatother
\def\writelog#1{\setbox0=\vbox{\parfillskip0pt \hsize16383.99999pt
\pretolerance=-1 \tolerance=-1 \hbadness=0 \showboxdepth=0 \ #1}}
\begin{document}
\input{comp6.tmp}
\typeout{----------}\end{document}'

  # TeX receives the program as whitespace-separated command-line
  # words.  Split it explicitly (no globbing) instead of relying on an
  # unquoted expansion.  read returns non-zero at end of input; that is
  # expected here.
  read -r -d '' -a tex_words <<< "${TEXFILE}"

  # Both output streams are discarded on purpose: only the log file
  # (minimal.log, named after the first file TeX reads) is of interest.
  "${LATEX}" "${tex_words[@]}" > /dev/null 2>&1

  # Extract the hyphenated words from the box dumps in the log and
  # translate TeX's ^^xx notation back to the national characters.
  sed -e '1,/^(comp6.tmp/ D' \
      -e '/\\hbox/ D' \
      -e '/^ *$/ D' \
      -e 's/^\(\[\]\)\? *\\T1[^ ]* /*/' \
      -e '/^----------/,$ c \
 ' minimal.log \
    | tr -d '\n)[]' \
    | tr -s ' ' '\n' \
    | sed -e 's/-\*//' \
          -e '1 s/\*//' \
          -e 's/\*/\
/'        -e 's/\^\^c5/Å/g' \
          -e 's/\^\^c6/Æ/g' \
          -e 's/\^\^d8/Ø/g' \
          -e 's/\^\^c7/Ç/g' \
          -e 's/\^\^c8/È/g' \
          -e 's/\^\^c9/É/g' \
          -e 's/\^\^d2/Ò/g' \
          -e 's/\^\^d3/Ó/g' \
          -e 's/\^\^d4/Ô/g' \
          -e 's/\^\^e5/å/g' \
          -e 's/\^\^e6/æ/g' \
          -e 's/\^\^f8/ø/g' \
          -e 's/\^\^e7/ç/g' \
          -e 's/\^\^e8/è/g' \
          -e 's/\^\^e9/é/g' \
          -e 's/\^\^f2/ò/g' \
          -e 's/\^\^f3/ó/g' \
          -e 's/\^\^f4/ô/g' \
          -e 's/^\([^-0-9]\+\)\([0-9]\+\)/\1\
\2/' \
    | if [ "${ONLYHYPHEN}" = true ]; then
        grep -e -
      else
        cat
      fi
}

accepted_candidates \
  | if [ "${NOTEX}" = true ]; then
      cat
    else
      hyphenate_with_tex
    fi \
  | if [ "${SORTFREQ}" = true ]; then
      sort | uniq -c | sort -n -r -s
    else
      cat
    fi

# clean: nothing to do here, the EXIT trap removes ${WORKDIR} together
# with comp*.tmp and minimal.{log,aux,dvi}.