#!/bin/sh
############################################################
#        SCRIPT FOR FRENCH PART OF SPEECH TAGGING
# for Helmut Schmid's TreeTagger with Achim Stein's lexicon
############################################################
# Achim Stein
# Universitaet Stuttgart, Institut fuer Linguistik-Romanistik
# Keplerstrasse 17, D-70174 Stuttgart
# e-mail: achim@ims.uni-stuttgart.de
# October 1997
###########################################################################

# Use Option -h to display a help screen.

# THESE VARIABLES HAVE TO BE SET:

BIN="/usr/local/durmtools/TreeTagger/bin"
LIB="/usr/local/durmtools/TreeTagger/lib"

# Default options handed to the tagger (replaced as a whole by -o)
TAGGEROPTS='-token -lemma -sgml'

# Tagger executable, located in the BIN directory
TAGGERCMD="$BIN/tree-tagger"

# French parameter file, located in the LIB directory (replaced by -p)
PARAMETERS="$LIB/french.par"

############################################################
# The script requires gawk (we used V2.15), tr, grep,
# the tree-tagger and the parameter file.
# 
# Input Files have to be in ISO-Latin-1, SGML-Codes (if any)
# should be surrounded by carriage returns.
#
# WHAT THIS SCRIPT DOES:
#
# 1. PRE-PROCESSING:
# The tokenization converts french text into a one-word-per-line
# format which complies with our lexicon entries and with the
# Parameter File(s) we distribute.
#
# 2. TAGGING with the default options (TAGGEROPTS, set above) or with
#    the options given with -o on the command line
#
# 3. POST-PROCESSING:
# Some frequent errors are eliminated by the awk script following
# the tagger command.
#
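# 4. OPTIONALLY (if -m is set) MARKS SENTENCE BOUNDARIES:
#    (NOTE: the option parser below only accepts -f, -h, -o and -p;
#    an option -m is not handled in this version of the script.)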
#
###########################################################################

# Command line options.
#   HELP   - set to 1 by -h, and by any option getopts does not accept,
#            so that a mistyped option shows the help screen instead of
#            being silently ignored
#   FILTER - set to 1 by -f
HELP=0
FILTER=0

while getopts fho:p: myopts; do
  case "$myopts" in
    f)
      # Filter mode: "cat" takes the place of the tagger, so neither
      # tagger options nor a parameter file are passed along.
      FILTER=1
      TAGGERCMD=cat
      TAGGEROPTS=""
      PARAMETERS=""
      ;;
    h) HELP=1 ;;
    o) TAGGEROPTS="$OPTARG" ;;
    p) PARAMETERS="$OPTARG" ;;
    *) HELP=1 ;;   # getopts reports "?" for an invalid option
  esac
done
# Drop the parsed options; what remains is the optional input file.
shift $((OPTIND - 1))


# Input file name. "$*" is identical to "$1" when exactly one argument is
# left and empty when there is none (stdin is read then); more than one
# argument is rejected by the check below.
INPUT="$*"

# Show the help screen on request (-h) or when more than one input
# argument was given, then stop.
if [ "${HELP:-0}" -gt 0 ] || [ $# -gt 1 ]; then
  cat << EOM
SYNTAX:
  tree-tagger-french [Script-Options] [-o 'TreeTagger options'] [input]
DESCRIPTION:
- tokenizes and morphologically analyzes French texts
- corrects the results
- reads from stdin (unless input is specified), writes to stdout
- requires Helmut Schmid's TreeTagger
- requires Gnu Awk (Tested with gawk version 2.15, patchlevel 4)
OPTIONS:
 -f        filter mode: the tagger is replaced by cat
 -h        display this help screen
 -o opts   options passed to the tagger instead of the default ones
 -p file   parameter file (Default: $PARAMETERS)
TREETAGGER-OPTIONS:
EOM
  # The tagger is started without arguments so that it prints its own
  # option summary. In filter mode TAGGERCMD is "cat", which would wait
  # for input on stdin instead, so it is skipped there.
  if [ "${FILTER:-0}" -eq 0 ]; then
    $TAGGERCMD
  fi
  exit
fi

####### Tokenization for French texts
# INPUT is either empty (cat then reads stdin) or a single file name.
# It is passed quoted so that a name containing blanks or glob
# characters reaches cat as one argument.
cat -s ${INPUT:+"$INPUT"} |\

gawk '
# SGML-Codes
# A line consisting of one tag only is kept as a single token: its
# blanks are turned into "~" and no further rule is applied to it.
/^<.*>$/ { gsub(/ /, "~"); print; next }
# A tag inside a text line: put a blank after every ">".
/<.*>/ {
   gsub(/>/, "> ")
}
{
# cut punctuation off
  # blank after every apostrophe (octal 047)
  gsub(/\047/, "\047 ")
  # no blanks in front of a percent sign
  gsub(/  *%/, "%")
  # hide "..." behind a placeholder so that the dot rule below
  # does not split it up
  gsub(/\.\.\./, " ___ ")
  # blanks around double quotes
  gsub(/"/, " & ")
  # blank in front of . , ; : ! ? ) ]
  gsub(/[\.,;:!\?\)\]]/, " &")
  # blank after ( and [
  gsub(/[\(\[]/, "& ")
  # bring the hidden "..." back
  gsub(/___/, "...")
  # delete "#" characters (written without a backslash: "\#" is not a
  # regexp operator and newer gawk versions warn about it)
  gsub(/#/, "")
  # two or three hyphens in a row become one hyphen set off by blanks
  gsub(/---?/, " - ")
# Strip leading and trailing spaces
  gsub(/^ */, "")
  gsub(/ *$/, "")
}
{ print }' |\

# One word per line: every blank is turned into a newline, then the
# empty lines are thrown away.

tr ' ' '\n' |\
grep -v -e '^$' |\


############# Handle exceptions:
# Some tokens were cut off too eagerly by the tokenizer. Every token is
# written with a leading newline; a token that belongs to its
# predecessor is written without one, which glues the two together.

gawk '
{
  # p1 always holds the previous input line
  glue = 0

  # Abbreviations which are in the lexicon/parameter file
  if ($0 ~ /^\.$/ && p1 ~ /^\*?(\..|Ets|Inc|M|MM|Mme|Mlle|Mr|etc|tél)$/) glue = 1
  if ($0 ~ /^(er|ère)$/ && p1 ~ /^[1Ii]$/) glue = 1
  if ($0 ~ /^nde?$/ && p1 ~ /^(2|II|ii)$/) glue = 1

  # Abbreviations of type "U.e.f.a." and telephone numbers
  if ($0 ~ /^\.[^\.]/) glue = 1

  # decimal numbers
  if ($0 ~ /^,[0-9]+/ && p1 ~ /[0-9]+/) glue = 1

  # series of numbers (e.g. 300 000)
  if ($0 ~ /^[0-9]+$/ && p1 ~ /^[0-9]+$/) glue = 1

  if (glue) {
    printf "%s", $0
  } else {
    printf "\n%s", $0
  }
  p1 = $0
}

# terminate the last token
END { printf "\n" }
' |\

# Hyphens and apostrophes
#
# Splits clitic pronouns off hyphenated verb forms and re-joins a few
# words that the tokenizer cut at an apostrophe.
#
# Changes with respect to the previous version of this stage:
#   - no empty line is written when no token is pending (clitic rule, END)
#   - the token following "entr" + apostrophe is no longer written twice
#   - a failed getline at the end of the input no longer reuses an old line
#   - printf is always called with an explicit "%s" format
#   - the rendez-vous / garde-à-vous pattern is anchored as a whole

gawk '
# One token is held back in "lastword" so that the following token can
# still be attached to it. It is written as soon as the next token
# turns out to be independent.

############### Do not separate (forms are in the lexicon)

$1 ~ /^([mM]oi|[tT]oi|[lL]ui|[eE]lle|[nN]ous|[vV]ous|[eE]ux|[eE]lles)-mêmes?$/ {
  hold_token($0)
  next
}

$1 ~ /^([cC]elle|[cC]elles|[cC]elui|[cC]eux|[pP]ar)-(ci|là)$/ {
  hold_token($0)
  next
}

# The parentheses make both anchors apply to both alternatives.
$1 ~ /^(rendez-vous|garde-à-vous)$/ {
  hold_token($0)
  next
}

################ Separate: e.g. -il, -t-il

$1 ~ /-(ce|ci|là|elle|elles|il|ils|je|la|le|les|leur|lui|même|mêmes|m\047|moi|nous|on|toi|tu|t\047|vous|en|y)$/ && (substr($1, 1, 1) != "-") {
  flush_pending()
  count = split($1, piece, "-")
  print piece[1]
  for (i = 2; i <= count; i++) {
    if (piece[i] == "t") {
      # no newline here: -t-elle is a lexicon entry
      printf "%s", "-" piece[i]
    } else {
      print "-" piece[i]
    }
  }
  next
}

############## Append words which should not be separated
# The pending token is written without a newline, so the current token
# ends up on the same output line. If the pending token does not fit,
# the current token falls through to the rules below.

$1 ~ /^hui$/ && lastword ~ /^[aA]ujourd\047$/ {
  glue_to_pending($1)
  next
}

$1 ~ /^est-à-dire$/ && lastword ~ /^[cC]\047$/ {
  glue_to_pending($1)
  next
}

$1 ~ /^(abord|ailleurs|après|autant)$/ && lastword ~ /^[dD]\047$/ {
  glue_to_pending($1)
  next
}

$1 ~ /^oeuvres?$/ && lastword ~ /^d\047$/ {
  glue_to_pending($1)
  next
}

$1 ~ /^%$/ && lastword ~ /^[0-9,\.]+$/ {
  glue_to_pending($1)
  next
}

############## Look ahead: join with the following token
# The argument is a regular expression that has to match at the
# beginning of the following token.

# grand + mère / père
$1 ~ /^[gG]rand\047$/  { glue_following("mère|père"); next }
# p + tit
$1 ~ /^p\047$/         { glue_following("tit"); next }
# quelqu + un
$1 ~ /^[qQ]uelqu\047$/ { glue_following("un"); next }
# entr + any following token (e.g. aimer)
$1 ~ /^[eE]ntr\047$/   { glue_following(""); next }
# Mam + zelle
$1 ~ /^[mM]am\047$/    { glue_following("zelle"); next }
# v + là
$1 ~ /^[vV]\047$/      { glue_following("là"); next }

############## Default

{ hold_token($0) }

END { flush_pending() }

############# functions

# Write the pending token, if there is one.
function flush_pending() {
  if (lastword != "") print lastword
  lastword = ""
}

# Write the pending token and keep tok as the new pending token.
function hold_token(tok) {
  flush_pending()
  lastword = tok
}

# Write the pending token without a newline and keep tok pending, so
# that both appear on one output line.
function glue_to_pending(tok) {
  printf "%s", lastword
  lastword = tok
}

# Write the current line without a newline and read the following line.
# If that line starts with start_re (an empty start_re accepts every
# non-empty line) it is written right behind the current one. Otherwise
# the current line is terminated and the following line becomes the
# pending token. At the end of the input the following line is empty.
function glue_following(start_re,    following) {
  flush_pending()
  printf "%s", $0
  if ((getline following) <= 0) following = ""
  if (following != "" && (start_re == "" || match(following, start_re) == 1)) {
    print following
  } else {
    printf "\n"
    lastword = following
  }
}'  |\


# Run the tagger on the token stream (with -f, TAGGERCMD is "cat" and the
# two other variables are empty). The expansions are not quoted, so
# TAGGEROPTS is split into separate options at its blanks and an empty
# variable contributes no argument at all.
$TAGGERCMD $PARAMETERS $TAGGEROPTS  |\


###################  Improvement of the Tagging Results
# Rule based corrections of the tagger output. The rules test field 1,
# 2 and 3 of every line; NOTE(review): with the default TAGGEROPTS these
# are presumably token, tag and lemma - confirm against the tagger.
#
# Each rule reads one or two lines ahead with getline. The result of
# every getline is checked, so that at the end of the input only lines
# that were really read are written, and the first rule is still applied
# when a single line follows.

gawk '

# Error 1: VER: instead of VER:aux
# Rule:    replace if one of the next two lines is tagged VER:pper
#          (VER-VER or VER-ADV-VER)
# Effect:  +0.04%
$2 ~ /VER:[^a]/ && $3 ~ /\352tre|avoir/ {
  cur = $0
  ahead = 0
  if ((getline nxt1) > 0) {
    ahead = 1
    if ((getline nxt2) > 0) ahead = 2
  }
  if ((ahead >= 1 && nxt1 ~ /VER:pper/) || (ahead == 2 && nxt2 ~ /VER:pper/)) {
    gsub(/VER:/, "VER:aux:", cur)
    count1++
  }
  print cur
  if (ahead >= 1) print nxt1
  if (ahead == 2) print nxt2
  next
}

# Error 3: pour tagged PRE instead of CON:sub
# Rule:    replace if a verb follows immediately
# Effect:  +0.09%
$1 ~ /^pour/ && $2 ~ /PRE/ {
  cur = $0
  if ((getline nxt1) <= 0) {
    print cur
    next
  }
  if (nxt1 ~ /VER:/) {
    gsub(/PRE/, "CON:sub", cur)
    count3++
  }
  print cur
  print nxt1
  next
}

# Error 4: grâce à/au/aux
# Rule:    replace if the next line contains à, au or aux
#          (the test looks at the whole line, not only at the token)
# Effect:  +0.06%
$1 ~ /^gr\342ce/ {
  cur = $0
  if ((getline nxt1) <= 0) {
    print cur
    next
  }
  if (nxt1 ~ /(\340|au|aux)/) {
    gsub(/NOM:femi:sg/, "PRE:1st", cur)
    gsub(/PRE:det:(femi|masc):(pl|sg)/, "&:2nd", nxt1)
    gsub(/NOM/, "PRE:1st", cur)
    gsub(/PRE:det$/, "PRE:det:2nd", nxt1)
    count20++
  }
  print cur
  print nxt1
  next
}

# all other lines are passed through unchanged
{ print }

# report on stderr how often each rule was applied
END {
  printf"Rule based modification:\n" > "/dev/stderr"
  printf"-- VER->VER:aux:                    %d\n", count1 > "/dev/stderr"
  printf"-- PRE->CON:sub:                    %d\n", count3 > "/dev/stderr"
  printf"-- grâce à: NOM->PRE:1st            %d\n", count20 > "/dev/stderr"
}
'