aboutsummaryrefslogtreecommitdiffstats
path: root/inorsk-compwordsmaybe
blob: be20481601cf77cd1c6743326fd8b1d42ec16772 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; backgroun
#! /bin/bash

# compound.sh: Find possible compound words
# Copyright:   Rune Kleveland 2000 (runekl@math.uio.no)
# Licence:     GPL

# This small script tries to find those pairs or triples of words in
# a Norwegian text file that should be written as one word (without
# hyphens!).  It reads from standard input and produces a hyphenated
# list of compound words in input order.  The words are hyphenated
# using TeX and the language norskc.  If your TeX installation doesn't
# provide these hyphenation patterns, you can give the -notex option
# until you feel brave enough to fix the real problem.

# This can help people who have problems writing correct Norwegian to
# avoid errors like `arkitekt tegnet', `matematikk lærer' etc.  But
# the script will not find rare compounds, and it might very well
# produce output even if there are no errors (å lese bøkene/de
# lesebøkene, med følelse/medfølelse, for andre/forandre).  It is a
# small tool, not the holy grail.


# Make a file with all candidates.

# compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]
#
#    -p patterns  Choose the patterns to hyphenate with.
#                 The default is norskc, in which case TeX
#                 executes \language=\l@norskc.
#
#    -l language  Choose the ispell dictionary.  The default is norsk.
#
#    -all         Also print the words that contain no hyphen
#
#    -s           Sort the words by frequency
#    -notex       Don't use TeX

# Defaults.  LANGUAGE, PATTERNS, ONLYHYPHEN, SORTFREQ and NOTEX can be
# overridden by the command-line options parsed further down.

# Work in a private scratch directory instead of writing fixed file names
# straight into the shared /tmp: predictable names there let concurrent
# runs overwrite each other and can be pre-created (or symlinked) by other
# users.  The directory and everything in it is removed when the script
# exits.
TMP=$(mktemp -d "${TMPDIR:-/tmp}/compound.XXXXXX") || exit 1
trap 'rm -rf -- "${TMP}"' EXIT

# Characters that make up a word; handed to tr as the set of characters
# to keep.
# NOTE(review): the non-ASCII letters below look mis-encoded in this copy
# of the file -- confirm that they are stored in the 8-bit encoding the
# ispell dictionary expects.
CH=a-z����������A-Z����������
LATEX=latex        # program run for the TeX hyphenation pass
LANGUAGE=norsk     # ispell dictionary (-l)
PATTERNS=norskc    # TeX hyphenation patterns (-p)
ONLYHYPHEN=true    # print only words containing a hyphen; -all turns it off
SORTFREQ=false     # -s: sort the words by frequency
NOTEX=false        # -notex: skip the TeX pass

# Print the usage message on standard error and exit with status 2.
usage() {
    echo 'Usage: compound [-p patterns] [-l language] [-all] [-s] [-notex] [file]' 1>&2
    exit 2
}

# parse_options ARG...
#
# Interpret the leading options in ARG... and set PATTERNS, LANGUAGE,
# ONLYHYPHEN, SORTFREQ and NOTEX accordingly.  Parsing stops at "-" or at
# the first argument that does not start with "-"; an unknown option, or
# -p/-l without a value, prints the usage message and exits with status 2.
# The number of arguments consumed is left in OPTIONS_CONSUMED so that the
# caller can shift them away.
parse_options() {
    OPTIONS_CONSUMED=0
    while [ $# != 0 ]
    do
        case "$1" in
            -p)
                # Refuse a missing value instead of silently using "".
                [ $# -ge 2 ] || usage
                PATTERNS=$2
                shift
                OPTIONS_CONSUMED=$((OPTIONS_CONSUMED + 1))
                ;;
            -l)
                [ $# -ge 2 ] || usage
                LANGUAGE=$2
                shift
                OPTIONS_CONSUMED=$((OPTIONS_CONSUMED + 1))
                ;;
            -all)
                ONLYHYPHEN=false
                ;;
            -s)
                SORTFREQ=true
                ;;
            -notex)
                NOTEX=true
                ;;
            -)
                # A lone "-" is not an option; leave it for the caller.
                break
                ;;
            -*)
                usage
                ;;
            *)
                break
                ;;
        esac
        shift
        OPTIONS_CONSUMED=$((OPTIONS_CONSUMED + 1))
    done
}

# Drop the parsed options so that only the file arguments remain in "$@".
parse_options "$@"
shift "${OPTIONS_CONSUMED}"

# Hyphenation commands result in a lot of unnecessary output.  Try to
# remove those first.  Also remove "- commands.
#
# The sed expressions, in order:
#   - delete everything from a % to the end of the line;
#   - delete lines holding a complete \hyphenation{...};
#   - delete the lines from an unclosed \hyphenation{ up to the next line
#     containing };
#   - delete every "- sequence.
# tr then replaces each run of characters outside ${CH} by one newline, so
# comp1.tmp holds one word per line.
#
# "$@" is quoted so that file names containing whitespace reach sed
# intact, and -- keeps a file name starting with a dash from being taken
# as a sed option.  Without file arguments sed reads standard input.

sed -e 's/%.*//' \
    -e '/\\hyphenation[[:blank:]]*{.*}/ D' \
    -e '/\\hyphenation[[:blank:]]*{[^}]*$/,/}/ D' \
    -e 's/\"-//g' -- "$@" \
  | tr -cs "${CH}" '\n' > "${TMP}/comp1.tmp"

# We make too many files in the process.  Do it in $TMP.
# Stop right away if the directory cannot be entered: every later step
# creates and deletes comp*.tmp files relative to the current directory.

cd "${TMP}" || exit 1

# Can someone come up with a smart sed script to make comp4 directly?
#
# comp2.tmp is comp1.tmp with its first line removed and the marker XXX
# appended after the last line; comp3.tmp is made from comp2.tmp the same
# way.  Line N of comp1/comp2/comp3 therefore holds words N, N+1 and N+2.

# Print file $1 without its first line, followed by an XXX marker line.
next_word_column() {
  sed -e '1 D' -e '$ a \
XXX' "$1"
}

next_word_column comp1.tmp > comp2.tmp
next_word_column comp2.tmp > comp3.tmp

# The delimiter list makes paste emit "w1-w2" and "w1-w2-w3" on separate
# lines.  grep drops candidates whose first word is a single character,
# whose last word has one or two characters, or which contain the XXX
# marker; tr finally removes the hyphens.
paste -d '-\
--' comp1.tmp comp2.tmp comp1.tmp comp2.tmp comp3.tmp |
  grep -v -e '\(^.-\|-..\?$\|XXX\)' |
  tr -d '-' > comp4.tmp

# Spellcheck the file, and find the words not in the dictionary.
# comp4.tmp is fed to ispell on standard input and ispell's output is
# stored in comp5.tmp.  ${LANGUAGE} is quoted so that a dictionary name
# given with -l is always passed to -d as a single argument.

ispell -B -l -d "${LANGUAGE}" < comp4.tmp > comp5.tmp

# Select the candidates that are in comp4.tmp but not in comp5.tmp: diff
# prefixes exactly those lines with "< ".  Everything from here down to
# the final "fi" is a single pipeline:
#
#   diff | grep | strip marker (-notex) or TeX pass | optional sort (-s)

diff comp4.tmp comp5.tmp | grep '<' | \
if [ "${NOTEX}" = true ]
then
  # -notex: only strip diff's two-character marker.
  sed -e 's/^..//'
else
  # Strip the marker and wrap the words in \writelog{...} groups, closing
  # one group and opening the next after every 1000th line (1000~1000 is
  # a GNU sed address).
  sed -e 's/^..//' \
      -e '1 i \
\\writelog{' \
      -e '1000~1000 a \
}\
\\writelog{' \
      -e '$ a \
}' \
    > comp6.tmp

  rm -f comp[12345].tmp

  # Hyphenate the result using TeX, to filter out words like sommer (som
  # mer) etc, i.e. the words that are not compounds.
  # \writelog typesets its argument into box 0; the tolerance/badness
  # settings are presumably there to make TeX report every line, with its
  # hyphenation points, in minimal.log, which is parsed below.

  TEXFILE='\nonstopmode
\documentclass{minimal}
\usepackage{t1enc}
\makeatletter
\language=\l@'${PATTERNS}'\lefthyphenmin=2\righthyphenmin=2
\ifx\gendiscretionary\@undefined\else
\hyphenclassesstate=1\hyphenclasses=5\exhyphenclass=4
\fi
\makeatother
\def\writelog#1{\setbox0=\vbox{\parfillskip0pt \hsize16383.99999pt
\pretolerance=-1 \tolerance=-1 \hbadness=0 \showboxdepth=0 \ #1}}
\begin{document}
\input{comp6.tmp}
\typeout{----------}\end{document}'

  # ${TEXFILE} is expanded unquoted, so the document reaches ${LATEX}
  # split into words rather than as one multi-line argument.
  # NOTE(review): presumably intentional -- check before quoting it.
  # Both output streams are discarded.  This is spelled "> /dev/null 2>&1"
  # because bash reads "2&>/dev/null" as an extra argument "2" followed by
  # "&>/dev/null".
  ${LATEX} ${TEXFILE} > /dev/null 2>&1

  # Pull the words back out of minimal.log.
  #   sed: drop everything up to and including the line starting with
  #        "(comp6.tmp", every line mentioning \hbox and every blank line;
  #        replace a leading "[] \T1... " prefix by "*"; replace everything
  #        from the "----------" line to the end by one line.
  #   tr:  join all lines, deleting ")", "[" and "]", then put every
  #        space-separated word on a line of its own.
  #   sed: remove a "-*" pair, remove a "*" on the first line, break the
  #        line at a remaining "*", turn TeX's ^^xx notation back into the
  #        8-bit letters, and move digits that follow a hyphen-less word
  #        to a line of their own.
  # The letters are written as $'\xNN' (the byte named by the ^^xx code) so
  # that the table does not depend on the encoding this file is saved in.
  sed -e '1,/^(comp6.tmp/ D' \
      -e '/\\hbox/ D' \
      -e '/^ *$/ D' \
      -e 's/^\(\[\]\)\? *\\T1[^ ]* /*/' \
      -e '/^----------/,$ c \
 '  minimal.log \
    | tr -d '\n)[]' \
    | tr -s ' ' '\n' \
    | sed -e 's/-\*//' \
          -e '1 s/\*//' \
          -e 's/\*/\
/' \
          -e $'s/\\^\\^c5/\xc5/g' \
          -e $'s/\\^\\^c6/\xc6/g' \
          -e $'s/\\^\\^d8/\xd8/g' \
          -e $'s/\\^\\^c7/\xc7/g' \
          -e $'s/\\^\\^c8/\xc8/g' \
          -e $'s/\\^\\^c9/\xc9/g' \
          -e $'s/\\^\\^d2/\xd2/g' \
          -e $'s/\\^\\^d3/\xd3/g' \
          -e $'s/\\^\\^d4/\xd4/g' \
          -e $'s/\\^\\^e5/\xe5/g' \
          -e $'s/\\^\\^e6/\xe6/g' \
          -e $'s/\\^\\^f8/\xf8/g' \
          -e $'s/\\^\\^e7/\xe7/g' \
          -e $'s/\\^\\^e8/\xe8/g' \
          -e $'s/\\^\\^e9/\xe9/g' \
          -e $'s/\\^\\^f2/\xf2/g' \
          -e $'s/\\^\\^f3/\xf3/g' \
          -e $'s/\\^\\^f4/\xf4/g' \
          -e 's/^\([^-0-9]\+\)\([0-9]\+\)/\1\
\2/' | \
  if [ "${ONLYHYPHEN}" = true ]
  then
    # Default: keep only the words that contain a hyphen.
    grep -e -
  else
    cat
  fi
fi | \
if [ "${SORTFREQ}" = true ]
then
  # -s: count identical words and list the most frequent first.
  sort | uniq -c | sort -n -r -s
else
  cat
fi

# Clean up every work file.  comp1-5 are listed here as well because with
# -notex the branch that normally deletes them is never executed.

rm -f comp[123456].tmp minimal.log minimal.aux minimal.dvi