############################################################################### # # Porter's stemming algorithm # # SYNOPSIS # # require 'stem.pl'; # @stems = stem( @words ); # # DESCRIPTION # # This routine applies the Porter Stemming Algorithm to its parameters, # returning the stemmed words. # It is derived from the C program "stemmer.c" # as found in freewais and elsewhere, which contains these notes: # # Purpose: Implementation of the Porter stemming algorithm documented # in: Porter, M.F., "An Algorithm For Suffix Stripping," # Program 14 (3), July 1980, pp. 130-137. # Provenance: Written by B. Frakes and C. Cox, 1986. # # I have re-interpreted areas that use Frakes and Cox's "WordSize" # function. My version may misbehave on short words starting with "y", # but I can't think of any examples. # # The step numbers correspond to Frakes and Cox, and are probably in # Porter's article (which I've not seen). # Porter's algorithm still has rough spots (e.g current/currency, -ings words), # which I've not attempted to cure, although I have added # support for the British -ise suffix. # # NOTES # # This is version 0.1. I would welcome feedback, especially improvements # to the punctuation-stripping step. # # AUTHOR # # Ian Phillipps # # COPYRIGHT # # Copyright Public IP Exchange Ltd (PIPEX). # Available for use under the same terms as perl. # ############################################################################### sub stem { my @parms = @_; foreach( @parms ) { $_ = lc $_; # Step 0 - remove punctuation s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//; next unless /^[a-z]+$/; # step1a_rules if( /[^s]s$/ ) { s/sses$/ss/ || s/ies$/i/ || s/s$// } # step1b_rules. The business with rule==106 is embedded in the # boolean expressions here. (/[aeiouy][^aeiouy].*eed$/ && s/eed$/ee/ ) || ( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) && ( # step1b1_rules s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ || s/dd$/d/ || s/ff$/f/ || s/gg$/g/ || s/mm$/m/ || s/nn$/n/ || s/pp$/p/ || s/rr$/r/ || s/tt$/t/ || s/ww$/w/ || s/xx$/x/ || # This is wordsize==1 && CVC...addanE... s/^[^aeiouy]+[aeiouy][^aeiouy]$/$&e/ ) #DEBUG && warn "step1b1: $_\n" ; # step1c_rules #DEBUG warn "step1c: $_\n" if s/([aeiouy].*)y$/$1i/; # step2_rules if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ || s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ || s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ || s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ || s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ || s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ || s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ || s/biliti$/ble/ ) { my ($l,$m) = ($`,$&); #DEBUG warn "step 2: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; } # step3_rules if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ || s/iciti$/ic/ || s/ical$/ic/ || s/ful$// || s/ness$// ) { my ($l,$m) = ($`,$&); #DEBUG warn "step 3: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; } # step4_rules if ( s/al$// || s/ance$// || s/ence$// || s/er$// || s/ic$// || s/able$// || s/ible$// || s/ant$// || s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ || s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// || s/iti$// || s/ous$// || s/ive$// || s/ize$// || s/ise$// ) { my ($l,$m) = ($`,$&); # Look for two consonant/vowel transitions # NB simplified... #DEBUG warn "step 4: l=$l m=$m\n"; $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/; } # step5a_rules #DEBUG warn("step 5a: $_\n") && s/e$// if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/ || ( /[aeiou][^aeiouy].*e/ && ! /[^aeiou][aeiouy][^aeiouwxy]e$/) ); # step5b_rules #DEBUG warn("step 5b: $_\n") && s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/; # Cosmetic step s/(.)i$/$1y/; } @parms; } 1;