# inorsk-hyphenmaybe

: Use /bin/bash

# Copyright: Rune Kleveland (2000) <runekl@math.uio.no>
# License  : GPL
# Version  : 0.9

# This script takes a (TeX) file as argument and prints every word that
# is not in the Norwegian dictionary, hyphenated the way TeX hyphenates
# it.  The parsing of the file is done by ispell.

# It can be used to find out which words the Norwegian patterns
# hyphenate incorrectly, because the Norwegian patterns I have
# generated should hyphenate every word in the norsk and nynorsk
# dictionaries correctly.  The incorrectly hyphenated words should be
# included in a \hyphenation command.  If there already are
# hyphenation commands in the TeX document, they are taken into
# consideration unless the -noparse option is given.

# If you tell me about incorrectly hyphenated common words, I might
# fix it in a future version.

# If multi level hyphenation is available, it is used.  Unfortunately
# this requires an experimental TeX today, and few people seem to have
# that.

# nohyphinsecure [-p patterns] [-l language] [-ll language] [-e] [-h]
#                [-nosort] [-nroff] [-noparse] [file]
#
#    -p patterns  Choose the patterns to hyphenate with.
#                 The default is norsk, in which case TeX
#                 executes \language=\l@norsk.
#
#    -l language  Choose the ispell dictionary.  The default is norsk.
#
#    -ll language Filter through extra dictionary.  The default is nynorsk.
#                 Use false to avoid filtering.
#
#    -e           Throw away all english words.
#
#    -h           Print only words that contain a hyphen.
#
#    -nosort      Do not sort the words, but output in the order they appear
#
#    -nroff       Ispell parsing keyword.  Overrides the default (TeX).
#
#    -noparse     Don't try to find hyphenation commands in the input file.


# Default settings.  The option parser may override everything except TMP.

# Private scratch directory holding hyphen[0-3].tmp and the LaTeX output.
# Fixed file names placed directly in /tmp are predictable and collide when
# two runs overlap, so a fresh directory is created for every run and removed
# again on exit.  It is always created below /tmp so the resulting path never
# contains whitespace.
TMP=$(mktemp -d /tmp/nohyphinsecure.XXXXXX) || {
  echo 'nohyphinsecure: cannot create a temporary directory' 1>&2
  exit 1
}
trap 'rm -rf -- "${TMP}"' EXIT

# NOTE(review): ISPELLMODE is not referenced anywhere else in the visible
# script; FORMAT below carries the flag that is actually passed to ispell.
ISPELLMODE=tex
LATEX=latex
# Primary and secondary ispell dictionaries ("false" disables a stage).
LANGUAGE=norsk
LLANGUAGE=nynorsk
# Characters kept by tr when no dictionary is used (LANGUAGE=false).
# NOTE(review): the non-ASCII letters on this line look mis-encoded (ten
# identical replacement characters per case); presumably they were the
# accented Norwegian letters in an 8-bit encoding -- restore from a pristine
# copy of the script.
CH=a-z����������A-Z����������
# Hyphenation patterns default to the dictionary language.
PATTERNS=${LANGUAGE}
IGNOREENGLISH=false
# ispell input-format flag: -t selects TeX parsing, -nroff switches to -n.
FORMAT=-t
ONLYHYPHEN=false
PARSEFORHYPH=true
SORTING=true

# Print the usage summary on stderr and leave with exit status 2.
usage() {
    echo 'Usage: nohyphinsecure [-p patterns] [-l dictionary] [-ll dictionary] [-e] [-h] [-nosort] [-nroff] [-noparse] [file] ...' \
      1>&2
    exit 2
}

# Interpret the leading option words of the argument list.
#
# Sets PATTERNS, LANGUAGE, LLANGUAGE, IGNOREENGLISH, ONLYHYPHEN, FORMAT,
# PARSEFORHYPH and SORTING as requested, and stores in OPTCOUNT the number of
# words consumed so that the caller can shift them away.  Scanning stops at
# the first word that is exactly "-" or does not start with "-"; that word
# and everything after it is left alone (these are the input files).
# An unknown option, or -p/-l/-ll without a value, prints the usage text and
# exits with status 2 instead of silently storing an empty value.
parse_options() {
    OPTCOUNT=0
    while [ $# != 0 ]
    do
        case "$1" in
            -p | -l | -ll)
                if [ $# -lt 2 ]
                then
                    echo "nohyphinsecure: option $1 requires an argument" 1>&2
                    usage
                fi
                case "$1" in
                    -p)  PATTERNS=$2 ;;
                    -l)  LANGUAGE=$2 ;;
                    -ll) LLANGUAGE=$2 ;;
                esac
                # Drop the option's value; the option word itself is dropped
                # by the common shift at the bottom of the loop.
                shift
                OPTCOUNT=$((OPTCOUNT + 1))
                ;;
            -e)
                IGNOREENGLISH=true
                ;;
            -h)
                ONLYHYPHEN=true
                ;;
            -nroff)
                FORMAT=-n
                ;;
            -noparse)
                PARSEFORHYPH=false
                ;;
            -nosort)
                SORTING=false
                ;;
            -)
                # A lone "-" is kept as an operand for the later cat call.
                break
                ;;
            -*)
                usage
                ;;
            *)
                break
                ;;
        esac
        shift
        OPTCOUNT=$((OPTCOUNT + 1))
    done
}

parse_options "$@"
# Leave only the file operands in the positional parameters.
shift "${OPTCOUNT}"


# Parse for \hyphenation command.  Assumes you use TeX gently.

# Stage 1: read the input files, optionally save their \hyphenation
# commands, and write the words unknown to the dictionaries to hyphen2.tmp.
#
# With PARSEFORHYPH=true the concatenated input is stored in hyphen0.tmp and
# scanned by sed: everything from the first % on a line is dropped,
# \hyphenation{...} commands closed on the same line are printed, and
# commands left open on their line are kept up to the next line containing
# "}"; all other lines are deleted.  The second sed removes whatever precedes
# "\hyphenation{" on a line.  The result, hyphen1.tmp, is loaded by the TeX
# driver later on.  Otherwise any stale hyphen1.tmp is removed.
#
# "$@" is quoted so that file names containing whitespace reach cat intact,
# and the operands of [ ] are quoted so an empty value compares as unequal
# instead of producing a syntax error.
if [ "${PARSEFORHYPH}" = true ]
then
  cat "$@" > "${TMP}/hyphen0.tmp"
  sed -e 's/%.*//' \
      -e '/\\hyphenation[[:blank:]]*{.*}/ p' \
      -e '/\\hyphenation[[:blank:]]*{[^}]*$/,/}/! D' "${TMP}/hyphen0.tmp" \
    | sed -e 's/^.\+\(\\hyphenation[[:blank:]]*{\)/\1/' \
    > "${TMP}/hyphen1.tmp"
  cat "${TMP}/hyphen0.tmp"
else
  rm -f "${TMP}/hyphen1.tmp"
  cat "$@"
fi | \
if [ "${LANGUAGE}" = false ]
then
  # No dictionary: split the text into one word per line.
  tr -cs "${CH}" '\n'
elif [ "${LLANGUAGE}" = false ]
then
  # One dictionary: ispell -l lists the words it does not know.
  tr '.,;:' '    ' \
    | ispell -B -l -d "${LANGUAGE}"  "${FORMAT}"
else
  # Two dictionaries: only words unknown to both survive.
  tr '.,;:' '    ' \
    | ispell -B -l -d "${LANGUAGE}"  "${FORMAT}" \
    | ispell -B -l -d "${LLANGUAGE}" "${FORMAT}"
fi \
  > "${TMP}/hyphen2.tmp"

# Stage 2: turn the word list in hyphen2.tmp into TeX input (hyphen3.tmp).
#
# Everything from here on works with relative file names inside the scratch
# directory, so a failed cd must stop the script; otherwise the later rm and
# output redirections would act on the caller's directory.
cd "${TMP}" || {
  echo "nohyphinsecure: cannot change directory to ${TMP}" 1>&2
  exit 1
}

if [ "${IGNOREENGLISH}" = true ]
then
  # Words made only of a-z/A-Z are checked against the english dictionary
  # and kept only when unknown there; words containing any other character
  # are passed through unchanged.
  (grep -v '[^a-zA-Z]' hyphen2.tmp | ispell -l -d english; \
   grep '[^a-zA-Z]' hyphen2.tmp)
else
  cat hyphen2.tmp
fi | \
if [ "${SORTING}" = true ]
then
  # Collapse duplicates into "count word" lines, most frequent first.
  sort \
  | uniq -c \
  | sort -n -r -s
else
  cat
fi \
  | sed -e '1 i \
\\writelog{' \
      -e '1000~1000 a \
}\
\\writelog{' \
      -e '$ a \
}' \
  > hyphen3.tmp
# The sed script above opens a \writelog{ group before line 1, closes and
# reopens it after every 1000th line, and closes it after the last line, so
# each \writelog argument holds at most 1000 input lines.

# Stage 3: run LaTeX over the word list.
#
# TEXFILE is a complete LaTeX document handed to ${LATEX} on its command
# line.  It selects the hyphenation patterns named by PATTERNS, enables the
# multi-level hyphenation registers when \gendiscretionary is defined, loads
# the \hyphenation commands saved in ./hyphen1.tmp if that file exists, and
# defines \writelog, which sets its argument in a box using the parameters
# shown below.  hyphen3.tmp is then \input and a line of dashes is written
# with \typeout; the log parser uses that line as its end marker.
# NOTE(review): the box parameters presumably force TeX to report the
# hyphenated paragraph in minimal.log -- confirm against the TeX manual.
TEXFILE='\nonstopmode
\documentclass{minimal}
\usepackage{t1enc}
\makeatletter
\language=\l@'${PATTERNS}'\lefthyphenmin=2\righthyphenmin=2
\ifx\gendiscretionary\@undefined\else
\hyphenclassesstate=1\hyphenclasses=5\exhyphenclass=4
\fi
\InputIfFileExists{./hyphen1.tmp}{}
\makeatother
\def\writelog#1{\setbox0=\vbox{\parfillskip0pt \hsize16383.99999pt
\pretolerance=-1 \tolerance=-1 \hbadness=0 \showboxdepth=0 \ #1}}
\begin{document}
\input{hyphen3.tmp}
\typeout{----------}
\end{document}'


# ${TEXFILE} stays unquoted on purpose: word splitting turns the newlines
# above into argument boundaries, exactly as the original invocation did.
# The original redirection "2&>/dev/null" was parsed by bash as an extra
# argument "2" followed by "&>/dev/null"; the form below silences stdout and
# stderr without passing a stray "2" to LaTeX.
${LATEX} ${TEXFILE} >/dev/null 2>&1

rm -f hyphen[0123].tmp

# Parse the log file

# Stage 4: extract the hyphenated words from minimal.log and print them.
#
# First sed: delete everything up to and including the line mentioning
# "(hyphen3.tmp", delete \hbox lines and blank lines, replace a leading
# optional "[]", blanks and the \T1... font token by a "*" marker, and
# replace everything from the "----------" marker line to the end of the log
# by a single blank line.  The two tr calls then join all lines (dropping
# ")" as well) and split the text again at blanks, one token per line.
#
# Second sed: remove "-*" (a hyphen directly followed by a marker), drop the
# marker on the first line, turn the first remaining marker of a line into a
# line break, delete blank lines, and translate TeX's ^^xx notation back
# into single bytes.  The replacement letters had been mis-encoded into one
# identical replacement character each; they are written as \xHH escapes
# here so that every code yields its own byte again (c5 c6 d8 = A-ring, AE,
# O-slash; e5 e6 f8 = the lower-case forms; the others are accented
# c/e/o letters, reading the codes as ISO 8859-1).  LC_ALL=C keeps the byte
# handling independent of the caller's locale.
sed -e '1,/(hyphen3.tmp/ D' \
    -e '/\\hbox/ D' \
    -e '/^ *$/ D' \
    -e 's/^\(\[\]\)\? *\\T1[^ ]* /*/' \
    -e '/^----------/,$ c \
 '  minimal.log \
  | tr -d '\n)' \
  | tr -s ' ' '\n' \
  | LC_ALL=C sed -e 's/-\*//' \
        -e '1 s/\*//' \
        -e 's/\*/\
/'      -e '/^ *$/ D' \
        -e 's/\^\^c5/\xc5/g' \
        -e 's/\^\^c6/\xc6/g' \
        -e 's/\^\^d8/\xd8/g' \
        -e 's/\^\^c7/\xc7/g' \
        -e 's/\^\^c8/\xc8/g' \
        -e 's/\^\^c9/\xc9/g' \
        -e 's/\^\^d2/\xd2/g' \
        -e 's/\^\^d3/\xd3/g' \
        -e 's/\^\^d4/\xd4/g' \
        -e 's/\^\^e5/\xe5/g' \
        -e 's/\^\^e6/\xe6/g' \
        -e 's/\^\^f8/\xf8/g' \
        -e 's/\^\^e7/\xe7/g' \
        -e 's/\^\^e8/\xe8/g' \
        -e 's/\^\^e9/\xe9/g' \
        -e 's/\^\^f2/\xf2/g' \
        -e 's/\^\^f3/\xf3/g' \
        -e 's/\^\^f4/\xf4/g' | \
if [ "${SORTING}" = true ]
then
  # Join each pair of lines with a blank.
  # NOTE(review): presumably the pair is the occurrence count followed by
  # its hyphenated word -- confirm against a real log.
  LC_ALL=C sed -e N -e 's/\n/ /'
else
  cat
fi | \
if [ "${ONLYHYPHEN}" = true ]
then
  # Keep only lines that contain a hyphen.
  LC_ALL=C grep '\-'
else
  cat
fi

# Remove the files written by the LaTeX run.
rm -f minimal.log minimal.aux minimal.dvi