#AlternateTokeniser.rules#
#diana 28/6/00#
#update 25/04/06#

#Tokeniser rule file
#Each rule should be on one line
#Lines that end with "\" are appended with the next one. This facility \
is used for longer rules that cannot be written on a single line
#
#Lines starting with "#" are treated as comment
//Lines starting with "//" are treated as comment
# Empty lines are ignored.

#A rule has a left hand side (LHS) and a right hand side (RHS);
#the LHS is a regular expression that has to be matched on the input
#the RHS describes the annotations to be added to the AnnotationSet.
#LHS is separated from the RHS by '>'
#LHS knows about the following operators:
# + (1..n)
# * (0..n)
# | (boolean OR)
#
#RHS uses as separator ';' and has the following format
#{LHS} > {Annotation type};{attribute1}={value1};...;{attribute n}={value n}

#The primitive constructs are:
#UNASSIGNED
#UPPERCASE_LETTER
#LOWERCASE_LETTER
#TITLECASE_LETTER
#MODIFIER_LETTER
#OTHER_LETTER
#NON_SPACING_MARK
#ENCLOSING_MARK
#COMBINING_SPACING_MARK
#DECIMAL_DIGIT_NUMBER
#LETTER_NUMBER
#OTHER_NUMBER
#SPACE_SEPARATOR
#LINE_SEPARATOR
#PARAGRAPH_SEPARATOR
#CONTROL
#FORMAT
#PRIVATE_USE
#SURROGATE
#DASH_PUNCTUATION
#START_PUNCTUATION
#END_PUNCTUATION
#CONNECTOR_PUNCTUATION
#OTHER_PUNCTUATION
#MATH_SYMBOL
#CURRENCY_SYMBOL
#MODIFIER_SYMBOL
#OTHER_SYMBOL
#...representing the corresponding enumerated Unicode category types
# See java.lang.Character for the Java version you are using

#------- The rules start here -----------------

#words#
// a word can be any combination of letters,
// excluding hyphens, symbols and punctuation, e.g. apostrophes

// An upper-case letter followed by zero or more lower-case letters.
"UPPERCASE_LETTER" (LOWERCASE_LETTER)* > Token;orth=upperInitial;kind=word;
// Two or more upper-case letters.
"UPPERCASE_LETTER" (UPPERCASE_LETTER)+ > Token;orth=allCaps;kind=word;
// One or more lower-case letters.
"LOWERCASE_LETTER" (LOWERCASE_LETTER)* > Token;orth=lowercase;kind=word;

// MixedCaps is any mixture of caps and small letters that doesn't
// fit in the preceding categories
// NOTE(review): in the last alternative below the trailing '+' applies to the
// whole group, i.e. it repeats the full "upper lower+ upper+ lower+" sequence.
// An input shaped like AbCdEf is therefore not covered by that alternative as
// a single token. Confirm this is intended rather than a '+' meant for the
// inner ("UPPERCASE_LETTER"+ "LOWERCASE_LETTER"+) group.
("LOWERCASE_LETTER" "LOWERCASE_LETTER"+"UPPERCASE_LETTER"+ \
(UPPERCASE_LETTER|LOWERCASE_LETTER)*)|\
("LOWERCASE_LETTER" "LOWERCASE_LETTER"*"UPPERCASE_LETTER"+\
(UPPERCASE_LETTER|LOWERCASE_LETTER)*)|\
("UPPERCASE_LETTER" "UPPERCASE_LETTER" (UPPERCASE_LETTER|LOWERCASE_LETTER)*\
("LOWERCASE_LETTER")+ (UPPERCASE_LETTER|LOWERCASE_LETTER)*)|\
("UPPERCASE_LETTER" "LOWERCASE_LETTER"+ ("UPPERCASE_LETTER"+ "LOWERCASE_LETTER"+))+\
> Token;orth=mixedCaps;kind=word;

#numbers#
// a number is any combination of digits
"DECIMAL_DIGIT_NUMBER"+ >Token;kind=number;
"OTHER_NUMBER"+ >Token;kind=number;

#whitespace#
// each space separator or control character becomes its own SpaceToken
(SPACE_SEPARATOR) >SpaceToken;kind=space;
(CONTROL) >SpaceToken;kind=control;

#symbols#
(MODIFIER_SYMBOL|MATH_SYMBOL|OTHER_SYMBOL) > Token;kind=symbol;
CURRENCY_SYMBOL > Token;kind=symbol;symbolkind=currency;

#punctuation#
// dashes are emitted as separate punctuation tokens (they are not part of words)
"DASH_PUNCTUATION" >Token;kind=punctuation;subkind=dashpunct;
(CONNECTOR_PUNCTUATION|OTHER_PUNCTUATION)>Token;kind=punctuation;
("START_PUNCTUATION"|"INITIAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=startpunct;
("END_PUNCTUATION"|"FINAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=endpunct;