#! /bin/bash
# compound.sh: Find possible compound words
# Copyright: Rune Kleveland 2000 (runekl@math.uio.no)
# Licence: GPL
# This small script tries to find those pairs or triples of words in
# a Norwegian text file that should be written in one word (without
# hyphens!). It reads from standard input and produces a hyphenated
# list of compound words in input order. The words are hyphenated
# using TeX and the language norskc. If your TeX installation doesn't
# provide these hyphenation patterns, you can give the -notex option
# until you feel brave enough to fix the real problem.
# This can help people who have problems writing correct Norwegian to
# avoid errors like `arkitekt tegnet', `matematikk lærer' etc. But
# the script will not find rare compounds, and it might very well
# produce output even if there are no errors (å lese bøkene/de
# lesebøkene, med følelse/medfølelse, for andre/forandre). It is a
# small tool, not the holy grail.
# Make a file with all candidates.
# compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]
#
# -p patterns Choose the patterns to hyphenate with.
# The default is norskc, in which case TeX
# executes \language=\l@norskc.
#
# -l language Choose the ispell dictionary. The default is norsk.
#
# -all Also print words not containing a hyphen
#
# -s Sort the words by frequency
# -notex Don't use TeX
# Scratch directory for the intermediate comp*.tmp and minimal.* files.
# A private directory from mktemp is used instead of fixed file names
# directly in the shared temp directory, so that concurrent runs cannot
# overwrite each other's files and the names cannot be guessed.
TMP=$(mktemp -d "${TMPDIR:-/tmp}/compound.XXXXXX") || exit 1
# Remove the scratch directory, with anything left in it, on every exit.
trap 'rm -rf -- "${TMP}"' EXIT
# Characters that make up a word; this set is handed to tr.
# NOTE(review): the replacement characters below look like accented
# letters lost in an encoding conversion of this file -- confirm against
# an intact copy and restore them.
CH=a-z����������A-Z����������
# Command used to run LaTeX.
LATEX=latex
# ispell dictionary (-l option).
LANGUAGE=norsk
# Name of the TeX hyphenation patterns (-p option).
PATTERNS=norskc
# true: print only words containing a hyphen (-all sets it to false).
ONLYHYPHEN=true
# true: sort the words by frequency (-s option).
SORTFREQ=false
# true: skip the TeX hyphenation step (-notex option).
NOTEX=false
# usage: print the command synopsis on standard error and exit with
# status 2.
usage() {
  echo 'Usage: compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]' 1>&2
  exit 2
}

# parse_options ARG...: consume the leading options of ARG..., updating
# PATTERNS, LANGUAGE, ONLYHYPHEN, SORTFREQ and NOTEX.  Parsing stops at
# a lone "-" or at the first argument that does not start with "-";
# neither of those is consumed.  The number of consumed arguments is
# left in OPTSHIFT so that the caller can shift them away.  An unknown
# option, or -p/-l without a value, ends the script through usage.
parse_options() {
  local total=$#
  while [ $# != 0 ]
  do
    case "$1" in
      -p)
        # A value is required; an empty pattern name is never useful.
        [ $# -ge 2 ] || usage
        PATTERNS=$2
        shift
        ;;
      -l)
        [ $# -ge 2 ] || usage
        LANGUAGE=$2
        shift
        ;;
      -all)
        ONLYHYPHEN=false
        ;;
      -s)
        SORTFREQ=true
        ;;
      -notex)
        NOTEX=true
        ;;
      -)
        break
        ;;
      -*)
        usage
        ;;
      *)
        break
        ;;
    esac
    shift
  done
  OPTSHIFT=$(( total - $# ))
}

# Parse the command line and drop the options, leaving only the file
# operands in "$@".
parse_options "$@"
shift "${OPTSHIFT}"
# Hyphenation commands result in a lot of unnecessary output, so they
# are removed first, together with TeX comments and "- commands:
#   s/%.*//                          drop everything from a % onwards
#   /\\hyphenation[ ]*{.*}/          drop one-line \hyphenation{...}
#   /\\hyphenation[ ]*{[^}]*$/,/}/   drop multi-line \hyphenation blocks
#   s/\"-//g                         delete every "- sequence
# tr then turns every run of characters outside ${CH} into a single
# line break, which leaves the words one per line in comp1.tmp.
# "$@" is quoted so that file names containing blanks or glob characters
# reach sed unchanged; without file operands sed reads standard input.
sed -e 's/%.*//' \
-e '/\\hyphenation[ ]*{.*}/ D' \
-e '/\\hyphenation[ ]*{[^}]*$/,/}/ D' \
-e 's/\"-//g' "$@" \
| tr -cs "${CH}" '\n' > "${TMP}/comp1.tmp"
# We make too many files in the process. Do it in $TMP, and stop if the
# directory cannot be entered, so that the relative-path writes and rm
# calls further down never run in some other directory.
cd "${TMP}" || exit 1
# Build the candidate list comp4.tmp from the word list comp1.tmp.
# (Can someone come up with a smart sed script to make comp4 directly?)
# comp2.tmp is comp1.tmp without its first line and comp3.tmp is
# comp2.tmp without its first line; each gets an XXX filler line
# appended after its last line.
sed -e '1 D' -e '$ a \
XXX' comp1.tmp > comp2.tmp
sed -e '1 D' -e '$ a \
XXX' comp2.tmp > comp3.tmp
# paste cycles through the delimiters "-", newline, "-", "-", so every
# input row yields a pair line (w1-w2) and a triple line (w1-w2-w3).
# Lines that start with a one-character word, end in a word of one or
# two characters, or contain the XXX filler are dropped; the remaining
# hyphens are then deleted to join the words.
paste -d '-\
--' comp1.tmp comp2.tmp comp1.tmp comp2.tmp comp3.tmp \
| grep -v -e '^.-' -e '-..\?$' -e 'XXX' \
| tr -d '-' \
> comp4.tmp
# Spellcheck the candidates. With -l ispell lists the words it does not
# accept, so comp5.tmp receives the candidates that are NOT in the
# dictionary; -B makes it report run-together words as errors as well
# (per the ispell manual).
# ${LANGUAGE} comes from the command line and is quoted so that it is
# always passed to -d as exactly one argument.
ispell -B -l -d "${LANGUAGE}" < comp4.tmp > comp5.tmp
# Keep the candidates that are absent from ispell's list in comp5.tmp:
# diff prints lines found only in comp4.tmp with a leading "< ", and
# grep selects those lines.  They are then either printed as they are
# (-notex) or hyphenated by TeX, and finally sorted by frequency if -s
# was given.
diff comp4.tmp comp5.tmp | grep '<' | \
if [ "${NOTEX}" = true ]
then
# -notex: only strip the two-character diff marker.
sed -e 's/^..//'
else
# Strip the diff marker and wrap the word list in \writelog{...}: open
# a group before line 1, close and reopen it after every 1000th line
# (GNU sed first~step address), and close it after the last line.
sed -e 's/^..//' \
-e '1 i \
\\writelog{' \
-e '1000~1000 a \
}\
\\writelog{' \
-e '$ a \
}' \
> comp6.tmp
# The earlier word lists are not read again after this point.
rm -f comp[12345].tmp
# Hyphenate the result using TeX, to filter out words like sommer (som
# mer) etc, i.e. the words that are not compounds.
TEXFILE='\nonstopmode
\documentclass{minimal}
\usepackage{t1enc}
\makeatletter
\language=\l@'${PATTERNS}'\lefthyphenmin=2\righthyphenmin=2
\ifx\gendiscretionary\@undefined\else
\hyphenclassesstate=1\hyphenclasses=5\exhyphenclass=4
\fi
\makeatother
\def\writelog#1{\setbox0=\vbox{\parfillskip0pt \hsize16383.99999pt
\pretolerance=-1 \tolerance=-1 \hbadness=0 \showboxdepth=0 \ #1}}
\begin{document}
\input{comp6.tmp}
\typeout{----------}\end{document}'
# ${TEXFILE} is expanded unquoted, so the shell splits it at blanks and
# line breaks and latex receives the pieces as separate arguments.
# NOTE(review): presumably TeX joins them into one input line -- confirm
# before quoting this expansion.
# Everything latex prints is discarded; the result is read from
# minimal.log below.
${LATEX} ${TEXFILE} > /dev/null 2>&1
# Turn minimal.log into one word per line:
#  - delete everything up to the first line starting with "(comp6.tmp",
#    every line containing \hbox and every blank line;
#  - replace an optional leading "[]", blanks and a \T1... token with
#    "*";
#  - replace the lines from the ---------- marker to the end (the text
#    given to the c command is empty);
#  - join all lines, delete ")", "[" and "]" and break at blanks;
#  - per line: delete the first "-*", on line 1 delete the first "*",
#    and turn the first remaining "*" into a line break;
#  - replace the ^^xx notation found in the log by single characters;
#  - put a run of digits that follows leading non-digit, non-hyphen
#    text on a line of its own.
# NOTE(review): the replacement characters in the ^^xx substitutions
# look like letters lost in an encoding conversion of this file; the hex
# codes suggest the Latin-1 letters (e.g. ^^e5 -> å, ^^f8 -> ø) --
# confirm against an intact copy and restore them.
sed -e '1,/^(comp6.tmp/ D' \
-e '/\\hbox/ D' \
-e '/^ *$/ D' \
-e 's/^\(\[\]\)\? *\\T1[^ ]* /*/' \
-e '/^----------/,$ c \
' minimal.log \
| tr -d '\n)[]' \
| tr -s ' ' '\n' \
| sed -e 's/-\*//' \
-e '1 s/\*//' \
-e 's/\*/\
/' -e 's/\^\^c5/�/g' \
-e 's/\^\^c6/�/g' \
-e 's/\^\^d8/�/g' \
-e 's/\^\^c7/�/g' \
-e 's/\^\^c8/�/g' \
-e 's/\^\^c9/�/g' \
-e 's/\^\^d2/�/g' \
-e 's/\^\^d3/�/g' \
-e 's/\^\^d4/�/g' \
-e 's/\^\^e5/�/g' \
-e 's/\^\^e6/�/g' \
-e 's/\^\^f8/�/g' \
-e 's/\^\^e7/�/g' \
-e 's/\^\^e8/�/g' \
-e 's/\^\^e9/�/g' \
-e 's/\^\^f2/�/g' \
-e 's/\^\^f3/�/g' \
-e 's/\^\^f4/�/g' \
-e 's/^\([^-0-9]\+\)\([0-9]\+\)/\1\
\2/' | \
if [ "${ONLYHYPHEN}" = true ]
then
# Default: print only the lines that contain a hyphen.
grep -e -
else
# -all: print every line.
cat
fi
fi | \
if [ "${SORTFREQ}" = true ]
then
# -s: count identical lines and list the most frequent ones first.
sort | uniq -c | sort -n -r -s
else
cat
fi
# Clean up: remove TeX's output files and every intermediate word list.
# comp1.tmp to comp5.tmp are included because the -notex path does not
# delete them anywhere else.
rm -f minimal.log minimal.aux minimal.dvi comp[123456].tmp