summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/resource/en/get_unc.awk
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/resource/en/get_unc.awk')
-rw-r--r--JLanguageTool/src/resource/en/get_unc.awk102
1 files changed, 102 insertions, 0 deletions
diff --git a/JLanguageTool/src/resource/en/get_unc.awk b/JLanguageTool/src/resource/en/get_unc.awk
new file mode 100644
index 0000000..a64c36e
--- /dev/null
+++ b/JLanguageTool/src/resource/en/get_unc.awk
@@ -0,0 +1,102 @@
+#the script annotates uncountable nouns
+BEGIN {FS="\t";
+glosfile="2of12inf.txt"; #Kevin's file
+while ((getline < glosfile) > 0){
+ if ($1~/%/) {gsub(/%/,"");
+ tabela[$1]="uncount"
+ }
+ }
+english_file="english.txt"; #created temporary file
+while ((getline < english_file) > 0){
+ if (tabela[$1]=="uncount")
+ lemma[$2]="uncount"
+ if ($3=="VBG")
+ gerund[$1]="uncount"
+ }
+uncountables="uncountable.txt" #uncountable nouns
+while ((getline < uncountables) > 0)
+ if ($0!~/^#/ && $0!="") {
+ if ($0~/ /) {
+ print "Entry " $0 " contains a space. Exiting."; exit(1)
+ }
+ lemma[$0]="uncount"
+ }
+
+partlycountable = "partlycountable.txt" #partly uncountable nouns
+while ((getline < partlycountable ) > 0)
+ if ($0!~/^#/ && $0!="") {
+ if ($0~/ /) {
+ print "Entry " $0 " contains a space. Exiting."; exit(1)
+ }
+ partly_noncount[$0]="uncount"
+ }
+
+#title
+partly_noncount["sri"]="uncount"
+
+#this should be a pronoun but there ain't such
+#tag in Penn Treebank
+lemma["anything"]="uncount"
+lemma["everybody"]="uncount"
+lemma["everyone"]="uncount"
+lemma["anyone"]="uncount"
+lemma["anybody"]="uncount"
+lemma["anyplace"]="uncount"
+lemma["everything"]="uncount"
+#lemma["everyplace"]="uncount"
+lemma["someone"]="uncount"
+lemma["somebody"]="uncount"
+lemma["something"]="uncount"
+#lemma["someplace"]="uncount"
+lemma["nobody"]="uncount"
+lemma["nothing"]="uncount"
+lemma["none"]="uncount"
+lemma["whatever"]="uncount"
+}
+{if ($3=="NN") {
+word="__"$1":::"
+split($1,maybe_gerund,"-")
+if (lemma[$1]=="uncount")
+ {print $1 FS $2 FS $3":U"}
+else
+if (partly_noncount[$1]=="uncount")
+ {print $1 FS $2 FS $3":UN"}
+else
+ #fields of knowledge - used as uncountable, but also sometimes as countable
+ {if (word~/logy:::/ && word!~/aetiology|anthology|apology|doxology|etiology|hagiology|trilogy/)
+ print $1 FS $2 FS $3":UN"
+ else
+ if (word~/plasty:::/)
+ print $1 FS $2 FS $3":UN"
+ else
+ if (word~/ity:::/ && word!~/acclivity|amenity|annuity|calamity|callosity|cavity|city|commodity|dacoity|declivity|eventuality|extremity|gratuity|laity|majority|municipality|muzzle-velocity|nativity|nonentity|principality|proclivity|sorority|speciality|trinity|university|varsity/)
+ print $1 FS $2 FS $3":UN"
+ else
+ #doctrines
+ if (tolower(word)==word && word~/ism:::/ && word!~/anachronism|anglicism|aphorism|atavism|colloqualism|euphuism|gallicism|__ism+++|malapropism|mannerism|micro-organism|organism|prism|solecism|spoonerism|specialism|syllogism|truism|witticism/)
+ print $1 FS $2 FS $3":UN"
+ else
+ #disciplines etc., ending with -ics
+ if (word~/ics:::/)
+ print $1 FS $2 FS $3":U"
+ else
+ if (word~/tion:::/ && word!~/T-junction|abduction|abjection|ablution|abrogation|accentuation|acceptation|activation|adjudication|adoption|adulteration|aeration|afforestation|alleviation|alternation|amelioration|amortization|amplification|amputation|annunciation|apparition|appellation|ascription|asseveration|assignation|assumption|avocation|bastion|beatification|benediction|bifurcation|blood-relation|by-election|calcination|canalization|canonization|capitalization|capitation|caption|carnation|castration|circumnavigation|circumvention|coaling-station|codification|collation|collectivization|complication|condonation|confabulation|configuration|conflagration|conformation|confutation|congratulation|conjuration|connotation|constellation|contradistinction|contraption|conurbation|convolution|copulation|coronation|corporation|correlation|coruscation|counteraction|counterattraction|crepitation|cross-examination|cross-fertilization|cross-section|culmination|de-escalation|debarkation|decapitation|declination|deflection|defoliation|denomination|depiction|deprecation|depredation|deputation|destination|detonation|diffraction|disembarkation|disposition|disquisition|dissertation|dissimulation|edition|ejaculation|ejection|elicitation|elucidation|enunciation|equalization|eradication|evaluation|evocation|exacerbation|excoriation|execration|exhibition|exoneration|expurgation|extermination|felicitation|flagellation|fluoridation|fluoridization|fraction|fumigation|gas-station|genuflection|gestation|graduation|harmonization|idealization|idolization|implementation|imprecation|incarceration|incarnation|inception|incubation|induction|inebriation|injunction|inscription|instigation|instillation|interaction|interjection|involution|irruption|legation|libation|liberalization|loan-collection|love-potion|malediction|materialization|misapplication|misdirection|mispronunciation|nation|notion|oblation|obligation|oration|orchestration|outstation|ovation|palpitation|pay-station|perambulation|peroration|perpetration|perpetuation|personation|pigmentation|plantation|polarization|police-station|polling-station|population|potation|potion|power-station|precondition|predestination|predetermination|predilection|predisposition|prefabrication|premonition|preposition|procreation|proliferation|prorogation|protestation|putrefaction|radio-location|ramification|ratification|redisposition|rejuvenation|rendition|repudiation|reticulation|rogation|scintillation|section|segregation|signification|situation|subsection|substation|subvention|summation|superscription|syndication|titillation|ulceration|ululation|valediction|visitation|weather-station/)
+ print $1 FS $2 FS $3":UN"
+ else
+ if (word~/ness:::/ && word!~/Guinness|baroness|deaconess|eyewitness|governess|harness|lioness|marchioness|ness|patroness|villainess|wilderness/)
+ print $1 FS $2 FS $3":UN"
+ else
+ if (gerund[$1]=="uncount" || gerund[maybe_gerund[2]]=="uncount")
+ print $1 FS $2 FS $3":UN"
+ else
+ if (lemma[maybe_gerund[2]]=="uncount")
+ {print $1 FS $2 FS $3":U"}
+ else
+ if (partly_noncount[maybe_gerund[2]]=="uncount")
+ {print $1 FS $2 FS $3":UN"}
+ else
+ print $0}
+ }
+else
+ print $0
+}