diff options
Diffstat (limited to 'JLanguageTool/src/resource/en/remap.awk')
-rw-r--r-- | JLanguageTool/src/resource/en/remap.awk | 304 |
1 files changed, 304 insertions, 0 deletions
diff --git a/JLanguageTool/src/resource/en/remap.awk b/JLanguageTool/src/resource/en/remap.awk new file mode 100644 index 0000000..0e795c3 --- /dev/null +++ b/JLanguageTool/src/resource/en/remap.awk @@ -0,0 +1,304 @@ +BEGIN {#FS=":|\|" +#Noun +map["N"]="NN" +#Plural +map["p"]="NNS" +#Noun Phrase +map["h"]="" #delete this in FSA +#Verb (usu participle) +map["V"]="VBG" +#Verb (transitive) +map["t"]="VB" +#Verb (intransitive) +map["i"]="VB" #no difference here! use a new tag? +#Adjective +map["A"]="JJ" +#Adverb +map["v"]="RB" +#Conjunction +map["C"]="CC" +#Preposition +map["P"]="IN" +#Interjection +map["!"]="UH" +#Pronoun +map["r"]="PRP" +#Definite Article +map["D"]="DT" +#Indefinite Article +map["I"]="DT" +#Nominative +map["o"]="" # no such tags in Moby? +#Technical mark +map["\|"]="" #delete this, unusable +map["\?"]="" #as above +} +#additional rules: JJ & "er"_END_OF_WORD=JJR +#additional rules: JJ & "est"_END_OF_WORD=JJS +#NN & Uppercase = NNP +#NNS & Uppercase = NNPS +#verbs: case of 5 fields +#give V: gave | given | giving | gives +# VBP VBD VBN VBG VBZ +#in case of 4 fields: VBD=VBN. +#PRP$, WDT, WP, WP$, WRB only by enumeration +#MD by enumeration + +#kill single-letter entries: +/^[a-zA-Z][\t ]/ {$0=""} + +#adjectives +/A:|A\?:/ && !/'/ { +gsub(/{[a-zA-Z:0-9_]+}/,"") +gsub(/,/,"") +gsub(/[0-9]/,"") +split ($0, adjective, /\|/) +#print "1" adjective[1] +#print "2" adjective[2] +split(adjective[1], jjr,":") +split(jjr[2], jjr_forms) +split(adjective[2],jjs_forms) +gsub(/\|/,"") +mark=0 +total=0 +if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" +for (n in jjr_forms) { + #if ($i"_END"!~/er_END/) + if (jjr_forms[n]!~/\?|<|\.|!/) + {print jjr_forms[n] "\t" $1 "\tJJR" + JJR[jjr_forms[n]]=$1 + set++ + } + } +for (n in jjs_forms) { + if (jjs_forms[n]!~/\?|<|\.|!/) + {print jjs_forms[n] "\t" $1 "\tJJS" + JJS[jjs_forms[n]]=$1 + set++ + } + } +} + +/N:|N\?:/ && !/'/ { +if ($1~/less/ && $2~/N\?:/ && NF==3) { + print $1 "\t" $1 "\tJJ" +} +else { +if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" +gsub(/[,<]/,"") +gsub(/{[a-zA-Z:0-9_]+}/,"") +gsub(/[0-9]\.[0-9]/,"") +gsub(/[0-9]+ /,"") +for (i=3;i<=NF;i++) { +# print ">>" $i +# print "NF=" NF, "string is:" $0 + if ($i!~/[\?\~\!]/ && "PFX"$i"SFX"!~/PFX([0-9]+|\|)SFX/) + if ($i!~/[A-Z]/) {print $i "\t" $1 "\tNNS" + nns[$i]=$1 + } + else + print $i "\t" $1 "\tNNPS" +} +} +} + +/V:/ && !/'/ { +gsub(/,/,"") +gsub(/{[a-zA-Z:0-9_]+}/,"") +gsub(/[0-9]\.[0-9]/,"") +gsub(/[0-9]+ /,"") +split($0,verb_fields,/\||:/) +#for (kk=1;kk<=10;kk++) { +# print kk verb_fields[kk] +# } +if (verb_fields[5]=="") { + print $1 "\t" $1 "\tVB" + print $1 "\t" $1 "\tVBP" + split (verb_fields[2], VBD, " ") + for (n in VBD) { + if (VBD[n]!~/[\?\~\!<]/ && "PFX"VBD[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBD[n] "\t" $1 "\tVBD" + if (VBD[n]!~/[\?\~\!<]/ && "PFX"VBD[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBD[n] "\t" $1 "\tVBN" + } + split (verb_fields[3], VBG, " ") + for (n in VBG) { + if (VBG[n]!~/[\?\~\!<]/ && "PFX"VBG[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBG[n] "\t" $1 "\tVBG" + } + split (verb_fields[4], VBZ, " ") + for (n in VBZ) { + if (VBZ[n]!~/[\?\~\!<]/ && "PFX"VBZ[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBZ[n] "\t" $1 "\tVBZ" + } + } +if (verb_fields[5]!="") { + print $1 "\t" $1 "\tVB" + print $1 "\t" $1 "\tVBP" + split (verb_fields[2], VBD, " ") + for (n in VBD) { + if (VBD[n]!~/[\?\~\!<]/ && "PFX"VBD[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBD[n] "\t" $1 "\tVBD" + } + split (verb_fields[3], VBN, " ") + for (n in VBN) { + if (VBN[n]!~/[\?\~\!<]/ && "PFX"VBN[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBN[n] "\t" $1 "\tVBN" + } + split (verb_fields[4], VBG, " ") + for (n in VBG) { + if (VBG[n]!~/[\?\~\!<]/ && "PFX"VBG[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBG[n] "\t" $1 "\tVBG" + } + split (verb_fields[5], VBZ, " ") + for (n in VBZ) { + if (VBZ[n]!~/[\?\~\!<]/ && "PFX"VBZ[n]"SFX"!~/PFX([0-9]+|\|)SFX/) print VBZ[n] "\t" $1 "\tVBZ" + } + } +} + + +"BEGIN"$2"END"~/BEGINCEND/ { print $1 "\t" $1 "\tCC"} +"BEGIN"$2"END"~/BEGINPEND/ { print $1 "\t" $1 "\t"map[$2]} +"BEGIN"$2"END"~/BEGIN\!END/ { print $1 "\t" $1 "\t"map[$2]} +#aaaa "BEGIN"$2"END"~/BEGINDEND/ { print $1 "\t" $1 "\t"map[$2]} #its as determiner! +"BEGIN"$2"END"~/BEGINIEND/ && !/ /{ print $1 "\t" $1 "\t"map[$2]} +"BEGIN"$2"END"~/BEGINrEND/ { print $1 "\t" $1 "\t"map[$2]} +"BEGIN"$2"END"~/BEGINvEND/ { print $1 "\t" $1 "\t"map[$2]} +#"BEGIN"$2"END"~/BEGINANEND/ { if (JJR[$1]=="" && JJS[$1]=="") print $1 "\t" $1 "\tJJ"} + +#interjections +/!/ && !/[ ']/ { +if ($1!~/!/) { + print $1 "\t" $1 "\t"map["!"] + gsub(/!/,"") + } + } + + +/\t(vA|Av)$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"] + print $1 "\t" $1 "\t"map["A"]} + +/\tvAN$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"] + print $1 "\t" $1 "\t"map["A"] + print $1 "\t" $1 "\t"map["N"] + } + +/\tvAtV$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"] +gsub(/\tvAtV/,"\tAVt") +} + +/\tv$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"]} + +/\tvP$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"] +print $1 "\t" $1 "\t"map["P"] +} + +/\tvN$/ && !/[ ']/ {print $1 "\t" $1 "\t"map["v"] +print $1 "\t" $1 "\t"map["N"] +} + +/\t(\|v|vP)/ && !/[ ']/ { + if (JJR[$1]!="") { + print $1 "\t" $1 "\tRBR"} + if (JJS[$1]!="") { + print $1 "\t" $1 "\tRBS"} + if (JJS[$1]=="" && JJS[$1]=="") { + print $1 "\t" $1 "\t"map["v"]} + } + +{gsub(/\tvPC/, "\tPCv") + gsub(/\tvC/,"\tCv/") +} + +/\tPCv/ {print $1 "\t" $1 "\t"map["P"] + print $1 "\t" $1 "\t"map["C"] + print $1 "\t" $1 "\t"map["v"]} + +/\tCv/ {print $1 "\t" $1 "\t"map["C"] + print $1 "\t" $1 "\t"map["v"]} + +/\tANtV/ {if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" + if (nns[$1]=="" || $1"_END"~/ics_END/) { + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } + print $1 "\t" $1 "\t"map["t"] + print $1 "\t" $1 "\t"map["V"] + } +/\tvNA$/ && !/'/ { +print $1 "\t" $1 "\t"map["v"] +gsub(/\tvNA/,"\tNA") +} + +/\t[AN][AN]/ && !/'/ { + if (JJR[$1]=="" && JJS[$1]=="" && $1"_END"!~/ism_END/) print $1 "\t" $1 "\tJJ" + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } +/\tN$/ && !/[ ']/ { + if (nns[$1]=="" || $1"_END"~/ics_END/) { + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } + } +/\t\|N$/ && !/[ ']/ && !/^[0-9\.]+\t/{ + if (nns[$1]=="" || $1"_END"~/ics_END/) { + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } + } +/\t[N!][N!]/ && !/[ ']/ { + if (nns[$1]=="" || $1"_END"~/ics_END/) { + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } + print $1 "\t" $1 "\t"map["!"] + } + +/\tA$/ && !/[ ']/{ + if (JJR[$1]=="" && JJS[$1]=="") { + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB"} + #print $1 "\t" $1 "\t"map["A"] + } + +/\t\|NA$/ && !/[ ']/ && !/^[0-9\.]+\t/{ + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" + } + +/\tpN$/ && !/[ ']/ {if (nns[$1]=="") {if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNNS"; else print $1 "\t" $1 "\tNNPS"} + } +/\tp$/ && !/[ ']/ {if (nns[$1]=="") {if ($1~/[A-Z]/) print $1 "\t" $1 "\tNNPS"} + } +/\tDA$/ && !/[ ']/{ +# if (JJR[$1]=="" && JJS[$1]=="") +# print $1 "\t" $1 "\t"map["A"] + print $1 "\t" $1 "\t"map["D"] + } +/\tAv$/ && !/[ ']/{ + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" +# print $1 "\t" $1 "\t"map["v"] +} +/\t(AV|AVti|AVt)$/ && !/[ ']/{ #AVti - only two words: articulate, foliate + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" +# print $1 "\t" $1 "\t"map["V"] +#this is wrong, actually V = VBG & VBN +} + + +/\t\|A$/ && !/[ ']/{ + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" + } + +/\t\|Av$/ && !/[ ']/{ + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" + + } + +/\tAtNV$/ && !/[ ']/ { + if (JJR[$1]=="" && JJS[$1]=="") + if ($1"_END"!~/ly_END/ || $1"_END"~/early_END/) print $1 "\t" $1 "\tJJ"; else print $1 "\t" $1 "\tRB" + if (nns[$1]=="" || $1"_END"~/ics_END/) { + if ($1!~/[A-Z]/) print $1 "\t" $1 "\tNN"; else print $1 "\t" $1 "\tNNP" + } + print $1 "\t" $1 "\t"map["t"] + print $1 "\t" $1 "\t"map["V"] + } + +/\t\!$/ && !/[ ']/{ + print $1 "\t" $1 "\t"map["!"] + } |