blob: a64c36e2c80f461e96b7c86a17c4cd040c5f252b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#the script annotates uncountable nouns
# Write a diagnostic to standard error so that it never mixes with the
# annotated records this filter prints on standard output.
function complain(message) {
    print message | "cat 1>&2"
    close("cat 1>&2")
}

# Warn when a data file could not be read (getline returned -1) and
# release the file in either case.
function finish_reading(path, status) {
    if (status < 0)
        complain("Warning: cannot read " path)
    close(path)
}

# Read a one-entry-per-line word list into `set`, marking every entry
# with "uncount".  Empty lines and lines starting with "#" are skipped.
# An entry containing a space aborts the whole script with exit status 1.
function read_word_list(path, set,    status) {
    while ((status = (getline < path)) > 0) {
        if ($0 ~ /^#/ || $0 == "")
            continue
        if ($0 ~ / /) {
            complain("Entry " $0 " contains a space. Exiting.")
            exit(1)
        }
        set[$0] = "uncount"
    }
    finish_reading(path, status)
}

# Collect the first field (with "%" removed) of every line of `path`
# whose first field contains a "%" sign.
function read_flagged_forms(path, set,    status) {
    while ((status = (getline < path)) > 0) {
        if ($1 ~ /%/) {
            gsub(/%/, "")          # edits $0, so $1 is re-split without "%"
            set[$1] = "uncount"
        }
    }
    finish_reading(path, status)
}

# Read the tab-separated temporary file: field 1 is looked up among the
# flagged forms, field 2 is then recorded in `lemmas`; field 1 of every
# line whose third field is "VBG" is recorded in `gerunds`.
function read_english(path, flagged, lemmas, gerunds,    status) {
    while ((status = (getline < path)) > 0) {
        # "in" tests membership without creating an empty entry per form.
        if ($1 in flagged)
            lemmas[$2] = "uncount"
        if ($3 == "VBG")
            gerunds[$1] = "uncount"
    }
    finish_reading(path, status)
}

# Build the lookup tables used by the annotation rule:
#   lemma[]           - uncountable nouns
#   partly_noncount[] - partly uncountable nouns
#   gerund[]          - forms tagged VBG in the temporary file
BEGIN {
    FS = "\t"

    glosfile = "2of12inf.txt"               # Kevin's file
    read_flagged_forms(glosfile, tabela)

    english_file = "english.txt"            # created temporary file
    read_english(english_file, tabela, lemma, gerund)

    uncountables = "uncountable.txt"        # uncountable nouns
    read_word_list(uncountables, lemma)

    partlycountable = "partlycountable.txt" # partly uncountable nouns
    read_word_list(partlycountable, partly_noncount)

    # title
    partly_noncount["sri"] = "uncount"

    # these should be pronouns, but there is no such tag
    # in the Penn Treebank
    lemma["anything"]   = "uncount"
    lemma["everybody"]  = "uncount"
    lemma["everyone"]   = "uncount"
    lemma["anyone"]     = "uncount"
    lemma["anybody"]    = "uncount"
    lemma["anyplace"]   = "uncount"
    lemma["everything"] = "uncount"
    #lemma["everyplace"]="uncount"
    lemma["someone"]    = "uncount"
    lemma["somebody"]   = "uncount"
    lemma["something"]  = "uncount"
    #lemma["someplace"]="uncount"
    lemma["nobody"]     = "uncount"
    lemma["nothing"]    = "uncount"
    lemma["none"]       = "uncount"
    lemma["whatever"]   = "uncount"
}
# Store every "|"-separated word of `words` as a key of `set`.
function add_words(set, words,    parts, n, i) {
    n = split(words, parts, "|")
    for (i = 1; i <= n; i++)
        set[parts[i]] = 1
}

# True when `key` is marked "uncount" in `set`.  The "in" test keeps the
# lookup from creating an empty entry for every noun that is checked.
function is_marked(set, key) {
    return (key in set) && set[key] == "uncount"
}

# True when `noun` is listed in `set` as a whole word, or when the part
# that follows one of its hyphens is (so "inner-city" is covered by
# "city").  A mere substring hit does not count: "electricity" is not
# covered by "city", and "happiness" is not covered by "ness".
function is_exception(set, noun) {
    while (1) {
        if (noun in set)
            return 1
        if (!sub(/^[^-]*-/, "", noun))   # drop the leading component
            return 0
    }
}

# Exception lists for the suffix rules below: nouns that carry the suffix
# but are not annotated by the suffix rule.
BEGIN {
    add_words(logy_exceptions, "aetiology|anthology|apology|doxology|etiology|hagiology|trilogy")

    add_words(ity_exceptions, "acclivity|amenity|annuity|calamity|callosity|cavity|city|commodity|dacoity|declivity")
    add_words(ity_exceptions, "eventuality|extremity|gratuity|laity|majority|municipality|muzzle-velocity|nativity")
    add_words(ity_exceptions, "nonentity|principality|proclivity|sorority|speciality|trinity|university|varsity")

    # "ism" is the bare word itself; "microorganism" is listed explicitly
    # because whole-word matching no longer catches it through "organism".
    add_words(ism_exceptions, "anachronism|anglicism|aphorism|atavism|colloquialism|euphuism|gallicism|ism")
    add_words(ism_exceptions, "malapropism|mannerism|micro-organism|microorganism|organism|prism|solecism")
    add_words(ism_exceptions, "spoonerism|specialism|syllogism|truism|witticism")

    add_words(tion_exceptions, "T-junction|abduction|abjection|ablution|abrogation|accentuation|acceptation|activation")
    add_words(tion_exceptions, "adjudication|adoption|adulteration|aeration|afforestation|alleviation|alternation")
    add_words(tion_exceptions, "amelioration|amortization|amplification|amputation|annunciation|apparition|appellation")
    add_words(tion_exceptions, "ascription|asseveration|assignation|assumption|avocation")
    add_words(tion_exceptions, "bastion|beatification|benediction|bifurcation|blood-relation|by-election")
    add_words(tion_exceptions, "calcination|canalization|canonization|capitalization|capitation|caption|carnation")
    add_words(tion_exceptions, "castration|circumnavigation|circumvention|coaling-station|codification|collation")
    add_words(tion_exceptions, "collectivization|complication|condonation|confabulation|configuration|conflagration")
    add_words(tion_exceptions, "conformation|confutation|congratulation|conjuration|connotation|constellation")
    add_words(tion_exceptions, "contradistinction|contraption|conurbation|convolution|copulation|coronation")
    add_words(tion_exceptions, "corporation|correlation|coruscation|counteraction|counterattraction|crepitation")
    add_words(tion_exceptions, "cross-examination|cross-fertilization|cross-section|culmination")
    add_words(tion_exceptions, "de-escalation|debarkation|decapitation|declination|deflection|defoliation")
    add_words(tion_exceptions, "denomination|depiction|deprecation|depredation|deputation|destination|detonation")
    add_words(tion_exceptions, "diffraction|disembarkation|disposition|disquisition|dissertation|dissimulation")
    add_words(tion_exceptions, "edition|ejaculation|ejection|elicitation|elucidation|enunciation|equalization")
    add_words(tion_exceptions, "eradication|evaluation|evocation|exacerbation|excoriation|execration|exhibition")
    add_words(tion_exceptions, "exoneration|expurgation|extermination")
    add_words(tion_exceptions, "felicitation|flagellation|fluoridation|fluoridization|fraction|fumigation")
    add_words(tion_exceptions, "gas-station|genuflection|gestation|graduation|harmonization")
    add_words(tion_exceptions, "idealization|idolization|implementation|imprecation|incarceration|incarnation")
    add_words(tion_exceptions, "inception|incubation|induction|inebriation|injunction|inscription|instigation")
    add_words(tion_exceptions, "instillation|interaction|interjection|involution|irruption")
    add_words(tion_exceptions, "legation|libation|liberalization|loan-collection|love-potion")
    add_words(tion_exceptions, "malediction|materialization|misapplication|misdirection|mispronunciation")
    add_words(tion_exceptions, "nation|notion|oblation|obligation|oration|orchestration|outstation|ovation")
    add_words(tion_exceptions, "palpitation|pay-station|perambulation|peroration|perpetration|perpetuation")
    add_words(tion_exceptions, "personation|pigmentation|plantation|polarization|police-station|polling-station")
    add_words(tion_exceptions, "population|potation|potion|power-station|precondition|predestination")
    add_words(tion_exceptions, "predetermination|predilection|predisposition|prefabrication|premonition")
    add_words(tion_exceptions, "preposition|procreation|proliferation|prorogation|protestation|putrefaction")
    add_words(tion_exceptions, "radio-location|ramification|ratification|redisposition|rejuvenation|rendition")
    add_words(tion_exceptions, "repudiation|reticulation|rogation")
    add_words(tion_exceptions, "scintillation|section|segregation|signification|situation|subsection|substation")
    add_words(tion_exceptions, "subvention|summation|superscription|syndication")
    add_words(tion_exceptions, "titillation|ulceration|ululation|valediction|visitation|weather-station")

    add_words(ness_exceptions, "Guinness|baroness|deaconess|eyewitness|governess|harness|lioness|marchioness")
    add_words(ness_exceptions, "ness|patroness|villainess|wilderness")
}

# Decide the countability annotation for a singular noun.
#   noun    - the word form (first field of the record)
#   returns - ":U"  when the noun is treated as uncountable,
#             ":UN" when it is treated as partly uncountable,
#             ""    when no rule applies.
# The rules are tried in the order given; the first one that fires wins.
# Reads the global tables lemma[], partly_noncount[] and gerund[].
function countability_tag(noun,    parts, n, second) {
    # For hyphenated compounds the second component is also looked up
    # (e.g. a gerund after the hyphen); "" when there is no hyphen.
    n = split(noun, parts, "-")
    second = (n >= 2) ? parts[2] : ""

    if (is_marked(lemma, noun))
        return ":U"
    if (is_marked(partly_noncount, noun))
        return ":UN"

    # fields of knowledge - used as uncountable, but also sometimes as countable
    if (noun ~ /logy$/ && !is_exception(logy_exceptions, noun))
        return ":UN"
    if (noun ~ /plasty$/)
        return ":UN"
    if (noun ~ /ity$/ && !is_exception(ity_exceptions, noun))
        return ":UN"

    # doctrines (all-lowercase words only)
    if (tolower(noun) == noun && noun ~ /ism$/ && !is_exception(ism_exceptions, noun))
        return ":UN"

    # disciplines etc., ending with -ics
    if (noun ~ /ics$/)
        return ":U"

    if (noun ~ /tion$/ && !is_exception(tion_exceptions, noun))
        return ":UN"
    if (noun ~ /ness$/ && !is_exception(ness_exceptions, noun))
        return ":UN"

    if (is_marked(gerund, noun) || is_marked(gerund, second))
        return ":UN"
    if (is_marked(lemma, second))
        return ":U"
    if (is_marked(partly_noncount, second))
        return ":UN"

    return ""
}

# Main rule: records are "form <FS> lemma <FS> tag".  Records tagged NN
# that get an annotation are re-printed as three fields with the
# annotation appended to the tag; every other record is passed through
# unchanged.
{
    tag = ($3 == "NN") ? countability_tag($1) : ""
    if (tag != "")
        print $1 FS $2 FS $3 tag
    else
        print $0
}
|