summary | refs | log | tree | commit | diff | stats
path: root/JLanguageTool/src/resource/en/remap.awk
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/resource/en/remap.awk')
-rw-r--r-- JLanguageTool/src/resource/en/remap.awk 304
1 files changed, 304 insertions, 0 deletions
diff --git a/JLanguageTool/src/resource/en/remap.awk b/JLanguageTool/src/resource/en/remap.awk
new file mode 100644
index 0000000..0e795c3
--- /dev/null
+++ b/JLanguageTool/src/resource/en/remap.awk
@@ -0,0 +1,304 @@
+# Table from the one-character part-of-speech codes found in the input to
+# the tag strings printed by the rules below.  An empty value marks a code
+# that has no output tag of its own.
+BEGIN {
+    #FS=":|\|"
+    map["N"] = "NN"     # Noun
+    map["p"] = "NNS"    # Plural
+    map["h"] = ""       # Noun Phrase -- delete this in FSA
+    map["V"] = "VBG"    # Verb (usu participle)
+    map["t"] = "VB"     # Verb (transitive)
+    map["i"] = "VB"     # Verb (intransitive) -- no difference here! use a new tag?
+    map["A"] = "JJ"     # Adjective
+    map["v"] = "RB"     # Adverb
+    map["C"] = "CC"     # Conjunction
+    map["P"] = "IN"     # Preposition
+    map["!"] = "UH"     # Interjection
+    map["r"] = "PRP"    # Pronoun
+    map["D"] = "DT"     # Definite Article
+    map["I"] = "DT"     # Indefinite Article
+    map["o"] = ""       # Nominative -- no such tags in Moby?
+    # Technical marks.  The keys are written without a backslash: "\|" and
+    # "\?" are not defined escape sequences inside an awk string constant.
+    map["|"] = ""       # delete this, unusable
+    map["?"] = ""       # as above
+}
+#additional rules: JJ & "er"_END_OF_WORD=JJR
+#additional rules: JJ & "est"_END_OF_WORD=JJS
+#NN & Uppercase = NNP
+#NNS & Uppercase = NNPS
+#verbs: case of 5 fields
+#give V: gave | given | giving | gives
+# VBP VBD VBN VBG VBZ
+#in case of 4 fields: VBD=VBN.
+#PRP$, WDT, WP, WP$, WRB only by enumeration
+#MD by enumeration
+
+# Discard entries whose headword is a single ASCII letter.  The record is
+# blanked rather than skipped, so the rules below find nothing to print.
+/^[a-zA-Z][\t ]/ {
+    $0 = ""
+}
+
+# Adjectives: records containing "A:" or "A?:" and no apostrophe.
+#
+# Output, one "form<TAB>headword<TAB>tag" line each:
+#   - the headword itself as JJ, or as RB when it ends in "ly" but not "early";
+#   - every word listed between the first ":" and the first "|" as JJR;
+#   - every word listed between the first and second "|" as JJS.
+# Listed words containing "?", "<", "." or "!" are skipped.
+#
+# The JJR[] and JJS[] tables (form -> headword) filled here are read by
+# later rules to avoid tagging a comparative/superlative as a plain JJ.
+/A:|A\?:/ && !/'/ {
+    # Strip {...} annotations, commas and digits before splitting.
+    gsub(/{[a-zA-Z:0-9_]+}/, "")
+    gsub(/,/, "")
+    gsub(/[0-9]/, "")
+
+    split($0, adjective, /\|/)
+    split(adjective[1], jjr, ":")
+    # Keep the element counts so the forms are emitted in listed order;
+    # "for (n in array)" visits elements in an unspecified order.
+    jjr_count = split(jjr[2], jjr_forms)
+    jjs_count = split(adjective[2], jjs_forms)
+
+    # Remove the separators from the record itself; this re-splits the fields.
+    gsub(/\|/, "")
+
+    if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) {
+        print $1 "\t" $1 "\tJJ"
+    } else {
+        print $1 "\t" $1 "\tRB"
+    }
+
+    for (n = 1; n <= jjr_count; n++) {
+        if (jjr_forms[n] !~ /\?|<|\.|!/) {
+            print jjr_forms[n] "\t" $1 "\tJJR"
+            JJR[jjr_forms[n]] = $1
+        }
+    }
+    for (n = 1; n <= jjs_count; n++) {
+        if (jjs_forms[n] !~ /\?|<|\.|!/) {
+            print jjs_forms[n] "\t" $1 "\tJJS"
+            JJS[jjs_forms[n]] = $1
+        }
+    }
+}
+
+# Nouns: records containing "N:" or "N?:" and no apostrophe.
+/N:|N\?:/ && !/'/ {
+    if ($1 ~ /less/ && $2 ~ /N\?:/ && NF == 3) {
+        # A headword containing "less" with an "N?:" tag and exactly three
+        # fields is emitted as an adjective only.
+        print $1 "\t" $1 "\tJJ"
+    } else {
+        # Headword: NNP when it contains an uppercase letter, NN otherwise.
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+
+        # Strip punctuation marks, {...} annotations and numbers; each gsub
+        # on the record re-splits the fields, so NF is read after all of them.
+        gsub(/[,<]/, "")
+        gsub(/{[a-zA-Z:0-9_]+}/, "")
+        gsub(/[0-9]\.[0-9]/, "")
+        gsub(/[0-9]+ /, "")
+
+        # Fields from the third onward are the listed forms.
+        for (i = 3; i <= NF; i++) {
+            if ($i ~ /[\?\~\!]/) continue
+            # Skip fields that are nothing but digits or a lone "|".
+            if (("PFX" $i "SFX") ~ /PFX([0-9]+|\|)SFX/) continue
+
+            if ($i ~ /[A-Z]/) {
+                print $i "\t" $1 "\tNNPS"
+            } else {
+                print $i "\t" $1 "\tNNS"
+                # Remember lowercase plurals (form -> headword) for later rules.
+                nns[$i] = $1
+            }
+        }
+    }
+}
+
+# Print one "form<TAB>lemma<TAB>tag" line for each usable word of the
+# space-separated string `list`, in listed order.  When `second_tag` is
+# non-empty a second line carrying that tag follows each form.
+# Words containing "?", "~", "!" or "<", and words that are only digits or
+# a lone "|", are skipped.  forms, count and k are locals.
+function emit_verb_forms(list, lemma, tag, second_tag,    forms, count, k) {
+    count = split(list, forms, " ")
+    for (k = 1; k <= count; k++) {
+        if (forms[k] ~ /[\?\~\!<]/) continue
+        if (("PFX" forms[k] "SFX") ~ /PFX([0-9]+|\|)SFX/) continue
+        print forms[k] "\t" lemma "\t" tag
+        if (second_tag != "") print forms[k] "\t" lemma "\t" second_tag
+    }
+}
+
+# Verbs: records containing "V:" and no apostrophe.
+# The record is split on ":" and "|"; piece 1 holds the headword and tag,
+# the following pieces hold the inflected forms:
+#   four pieces:  past (VBD and VBN) | -ing form (VBG) | 3rd person (VBZ)
+#   five pieces:  past (VBD) | participle (VBN) | -ing form (VBG) | 3rd person (VBZ)
+# The headword itself is always printed as both VB and VBP.
+/V:/ && !/'/ {
+    # Strip commas, {...} annotations and numbers before splitting.
+    gsub(/,/, "")
+    gsub(/{[a-zA-Z:0-9_]+}/, "")
+    gsub(/[0-9]\.[0-9]/, "")
+    gsub(/[0-9]+ /, "")
+    split($0, verb_fields, /\||:/)
+
+    print $1 "\t" $1 "\tVB"
+    print $1 "\t" $1 "\tVBP"
+
+    if (verb_fields[5] == "") {
+        # No separate participle listed: the past form doubles as VBN.
+        emit_verb_forms(verb_fields[2], $1, "VBD", "VBN")
+        emit_verb_forms(verb_fields[3], $1, "VBG", "")
+        emit_verb_forms(verb_fields[4], $1, "VBZ", "")
+    } else {
+        emit_verb_forms(verb_fields[2], $1, "VBD", "")
+        emit_verb_forms(verb_fields[3], $1, "VBN", "")
+        emit_verb_forms(verb_fields[4], $1, "VBG", "")
+        emit_verb_forms(verb_fields[5], $1, "VBZ", "")
+    }
+}
+
+
+# Entries whose second field is a lone tag letter.  $2 is wrapped in the
+# BEGIN/END sentinels so the regex can pin the letter between them.
+
+# Conjunction.
+("BEGIN" $2 "END") ~ /BEGINCEND/ {
+    print $1 "\t" $1 "\tCC"
+}
+
+# Preposition.
+("BEGIN" $2 "END") ~ /BEGINPEND/ {
+    print $1 "\t" $1 "\t" map[$2]
+}
+
+# Interjection.
+("BEGIN" $2 "END") ~ /BEGIN\!END/ {
+    print $1 "\t" $1 "\t" map[$2]
+}
+
+# Disabled ("its" as determiner!):
+#aaaa "BEGIN"$2"END"~/BEGINDEND/ { print $1 "\t" $1 "\t"map[$2]}
+
+# Indefinite article, only for records that contain no space.
+("BEGIN" $2 "END") ~ /BEGINIEND/ && !/ / {
+    print $1 "\t" $1 "\t" map[$2]
+}
+
+# Pronoun.
+("BEGIN" $2 "END") ~ /BEGINrEND/ {
+    print $1 "\t" $1 "\t" map[$2]
+}
+
+# Adverb.
+("BEGIN" $2 "END") ~ /BEGINvEND/ {
+    print $1 "\t" $1 "\t" map[$2]
+}
+
+# Disabled:
+#"BEGIN"$2"END"~/BEGINANEND/ { if (JJR[$1]=="" && JJS[$1]=="") print $1 "\t" $1 "\tJJ"}
+
+# Interjections: the record contains "!" (and no space or apostrophe) while
+# the headword itself does not.  After printing, every "!" is removed from
+# the record, which changes what the rules further down see.
+/!/ && !/[ ']/ && $1 !~ /!/ {
+    print $1 "\t" $1 "\t" map["!"]
+    gsub(/!/, "")
+}
+
+
+# Tag strings (the text after the last tab) that include the adverb code
+# "v".  None of these rules fires on records containing a space or an
+# apostrophe.
+
+# "vA" / "Av": adverb and adjective.
+/\t(vA|Av)$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+    print $1 "\t" $1 "\t" map["A"]
+}
+
+# "vAN": adverb, adjective and noun.
+/\tvAN$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+    print $1 "\t" $1 "\t" map["A"]
+    print $1 "\t" $1 "\t" map["N"]
+}
+
+# "vAtV": print the adverb reading, then rewrite the tag string to "AVt"
+# so that a later rule matching "AVt" sees the record.
+/\tvAtV$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+    gsub(/\tvAtV/, "\tAVt")
+}
+
+# "v": adverb only.
+/\tv$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+}
+
+# "vP": adverb and preposition.
+/\tvP$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+    print $1 "\t" $1 "\t" map["P"]
+}
+
+# "vN": adverb and noun.
+/\tvN$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["v"]
+    print $1 "\t" $1 "\t" map["N"]
+}
+
+# Tag strings starting with "|v" or "vP" (no space or apostrophe in the
+# record).  A headword previously recorded as a comparative adjective form
+# is printed as RBR, one recorded as a superlative form as RBS, and only a
+# headword recorded as neither gets the plain adverb tag.
+/\t(\|v|vP)/ && !/[ ']/ {
+    if (JJR[$1] != "") {
+        print $1 "\t" $1 "\tRBR"
+    }
+    if (JJS[$1] != "") {
+        print $1 "\t" $1 "\tRBS"
+    }
+    # Both tables must be empty for this word; the check used to test
+    # JJS twice, so comparatives were also printed as plain adverbs.
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        print $1 "\t" $1 "\t" map["v"]
+    }
+}
+
+# Normalise the tag order on every record so the two rules below can match:
+# "vPC" becomes "PCv" and "vC" becomes "Cv".
+{
+    gsub(/\tvPC/, "\tPCv")
+    # The replacement used to be "\tCv/", leaving a stray "/" in the record.
+    gsub(/\tvC/, "\tCv")
+}
+
+# Preposition, conjunction and adverb.
+/\tPCv/ {
+    print $1 "\t" $1 "\t" map["P"]
+    print $1 "\t" $1 "\t" map["C"]
+    print $1 "\t" $1 "\t" map["v"]
+}
+
+# Conjunction and adverb.
+/\tCv/ {
+    print $1 "\t" $1 "\t" map["C"]
+    print $1 "\t" $1 "\t" map["v"]
+}
+
+# "ANtV": adjective, noun and verb readings of the same headword.
+/\tANtV/ {
+    # Adjective reading, unless the word is a recorded comparative or
+    # superlative form; "...ly" words other than "...early" count as adverbs.
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+    # Noun reading, unless the word is a recorded plural form; words ending
+    # in "ics" are printed regardless.
+    if (nns[$1] == "" || ($1 "_END") ~ /ics_END/) {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+    }
+    print $1 "\t" $1 "\t" map["t"]
+    print $1 "\t" $1 "\t" map["V"]
+}
+
+# "vNA": print the adverb reading, then reduce the tag string to "NA" so
+# the adjective/noun rule that follows handles the rest.
+/\tvNA$/ && !/'/ {
+    print $1 "\t" $1 "\t" map["v"]
+    gsub(/\tvNA/, "\tNA")
+}
+
+# Tag strings starting with two letters drawn from "A" and "N".
+/\t[AN][AN]/ && !/'/ {
+    # JJ unless the word is a recorded comparative/superlative form or ends
+    # in "ism".
+    if (JJR[$1] == "" && JJS[$1] == "" && ($1 "_END") !~ /ism_END/) {
+        print $1 "\t" $1 "\tJJ"
+    }
+    if ($1 ~ /[A-Z]/) {
+        print $1 "\t" $1 "\tNNP"
+    } else {
+        print $1 "\t" $1 "\tNN"
+    }
+}
+
+# "N" alone: noun, unless the word is a recorded plural form (words ending
+# in "ics" are printed regardless).
+/\tN$/ && !/[ ']/ {
+    if (nns[$1] == "" || ($1 "_END") ~ /ics_END/) {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+    }
+}
+
+# "|N": as above, but records whose headword consists only of digits and
+# dots are ignored.
+/\t\|N$/ && !/[ ']/ && !/^[0-9\.]+\t/ {
+    if (nns[$1] == "" || ($1 "_END") ~ /ics_END/) {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+    }
+}
+
+# Tag strings starting with two characters drawn from "N" and "!": noun
+# (same plural check as above) followed by the interjection tag.
+/\t[N!][N!]/ && !/[ ']/ {
+    if (nns[$1] == "" || ($1 "_END") ~ /ics_END/) {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+    }
+    print $1 "\t" $1 "\t" map["!"]
+}
+
+# The adjective rules in this group share one action: unless the headword
+# is a recorded comparative (JJR[]) or superlative (JJS[]) form, print it
+# as RB when it ends in "ly" but not "early", and as JJ otherwise.
+# No rule here fires on records containing a space or an apostrophe.
+
+# "A" alone.
+/\tA$/ && !/[ ']/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+    #print $1 "\t" $1 "\t"map["A"]
+}
+
+# "|NA", ignoring records whose headword is only digits and dots.
+/\t\|NA$/ && !/[ ']/ && !/^[0-9\.]+\t/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+}
+
+# "pN": plural, unless already recorded as a plural form of another noun.
+/\tpN$/ && !/[ ']/ {
+    if (nns[$1] == "") {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNPS"
+        } else {
+            print $1 "\t" $1 "\tNNS"
+        }
+    }
+}
+
+# "p" alone: only headwords containing an uppercase letter are printed.
+/\tp$/ && !/[ ']/ {
+    if (nns[$1] == "" && $1 ~ /[A-Z]/) {
+        print $1 "\t" $1 "\tNNPS"
+    }
+}
+
+# "DA": only the determiner reading is printed.
+/\tDA$/ && !/[ ']/ {
+#   if (JJR[$1]=="" && JJS[$1]=="")
+#       print $1 "\t" $1 "\t"map["A"]
+    print $1 "\t" $1 "\t" map["D"]
+}
+
+# "Av".
+/\tAv$/ && !/[ ']/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+#   print $1 "\t" $1 "\t"map["v"]
+}
+
+# "AV", "AVti", "AVt" (AVti - only two words: articulate, foliate).
+/\t(AV|AVti|AVt)$/ && !/[ ']/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+#   print $1 "\t" $1 "\t"map["V"]
+#   this is wrong, actually V = VBG & VBN
+}
+
+
+# "|A".
+/\t\|A$/ && !/[ ']/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+}
+
+# "|Av".
+/\t\|Av$/ && !/[ ']/ {
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+}
+
+# "AtNV": adjective, noun and verb readings of the same headword
+# (records containing a space or an apostrophe are ignored).
+/\tAtNV$/ && !/[ ']/ {
+    # Adjective reading, unless the word is a recorded comparative or
+    # superlative form; "...ly" words other than "...early" count as adverbs.
+    if (JJR[$1] == "" && JJS[$1] == "") {
+        if (($1 "_END") ~ /ly_END/ && ($1 "_END") !~ /early_END/) {
+            print $1 "\t" $1 "\tRB"
+        } else {
+            print $1 "\t" $1 "\tJJ"
+        }
+    }
+    # Noun reading, unless the word is a recorded plural form; words ending
+    # in "ics" are printed regardless.
+    if (nns[$1] == "" || ($1 "_END") ~ /ics_END/) {
+        if ($1 ~ /[A-Z]/) {
+            print $1 "\t" $1 "\tNNP"
+        } else {
+            print $1 "\t" $1 "\tNN"
+        }
+    }
+    print $1 "\t" $1 "\t" map["t"]
+    print $1 "\t" $1 "\t" map["V"]
+}
+
+# Tag string that is exactly "!": interjection.
+/\t\!$/ && !/[ ']/ {
+    print $1 "\t" $1 "\t" map["!"]
+}