Friday, March 7, 2014

Porter Stemming: Pruning Words to Their Root

Stemming decreases the variety of words by finding the root or stem of a word.   For example, "house", "houses" and "housing" parse as separate words, but share the same root: "hous".  Using this stem reduce three words to one and may improve document matching A well known stemmer is by Martin Porter and is available on-line with a comprehensive list of sources and code including T-SQL by Keith Lubell.

Originally intended to use Mr Lubell's code in my text processing program, but I decided to code in (what I believe to be) a simpler form.  My objective was to incorporate all of the code required for the stemmer  into a single function rather than have stand alone tables and helper functions that support only the stemmer as does the available SQL.  I was only modestly successful having substituted case-when statements for the table, but as the helper functions are called repeatedly, and I know of no way to code a subroutine in T-SQL functions, the helper functions remain.

The helper functions and what they do:

  • fx_psPat converts a word into a string of v& c for verbs and consonants
  • fx_psMeas  calculates the measure of a word:  how often a vc pattern occurs
  • fx_psVl identifies if there is a vowel in the string
  • fx_psCVC identifies if the the last three letters of the stem are consonant- vowel-consonant
  • fx_psCC identifies if the stem ends in a double consonant
The primary function is fx_psWORD('word').   Below is a test query and the results:

select dbo.fx_psWORD('house')  house, dbo.fx_psWORD('houses') houses, dbo.fx_psWORD('housing') housing


house houses housing
hous hous hous



The code in its entirety:


Create function fx_psPat(@str varchar(48))
returns varchar(24)
as 
begin
declare @ctr smallint, @pat varchar(48)
select @ctr = 1, @pat = ''
while @ctr<=len(@str)
begin
set @pat = @pat+case 
when charindex(substring(@str,@ctr,1),'aeiou')>0 then 'v'  
when substring(@str,@ctr,1) = 'y' and @ctr>1 and charindex(substring(@str,@ctr-1,1), 'aeiou') = 0 then 'v'
else 'c' end
set @ctr = @ctr+1
end
return @pat
end

Create function fx_psMeas(@str varchar(48))
returns smallint
as 
begin
declare @ctr smallint, @flg binary, @m smallint
select @ctr = 1, @flg = 0, @m=0, @str = dbo.fx_psPat(@str)
while @ctr<=len(@str)
begin
if substring(@str,@ctr,1)='v' or @flg = 1
begin
set @flg = 1
if substring(@str,@ctr,1) = 'c' select @m = @m+1, @flg = 0
end
set @ctr = @ctr+1
end
return @m
end

Create function fx_psVl(@str varchar(48))
returns binary
as
begin
return (select case when  charindex( 'v', dbo.fx_psPat(@str))>0 then 1 else 0 end)
end


create function fx_psCVC(@str varchar(48))
returns binary
as
begin
return (select case when  len(@str)>2 and right(dbo.fx_psPat(@str),3)='cvc'  and charindex(right(@str,1),'wxy') = 0  then 1 else 0 end)
end


create function fx_psCC(@str varchar(48))
returns binary
as
begin
return (select case when  len(@str)>1 and ((right(dbo.fx_psPat(@str),2) ='cc' and substring(@str, len(@str),1)= substring(@str, len(@str)-1,1))  
or (right(@str,2) ='yy' and charindex(substring(@str, len(@str)-3,1),'aeiou')=0 ))  then 1 else 0 end)
end

alter function fx_psWORD(@str varchar(48))
returns varchar(48)
as
begin

set @str = lower(rtrim(ltrim(@str)))

if len(@str)>2
begin 

--step 1a
set @str = case
when len(@str)>4 and right(@str, 4) = 'sses'  then left(@str, len(@str)-4)+'ss'
when len(@str)>3 and right(@str, 3) = 'ies'  then left(@str, len(@str)-3)+ 'i'
when len(@str)>2 and right(@str, 2) = 'ss'  then @str
when len(@str)>1 and right(@str, 1) = 's'  then left(@str,len(@str)-1)
else @str end

--step 1b
set @str = case  
when len(@str)>3 and dbo.fx_psMeas(left(@str, len(@str)-3))>0  and right(@str,3) ='eed'  then  left(@str, len(@str)-3)+'ee'
when len(@str)>3 and right(@str,3) ='eed' then  @str
when len(@str)>2 and dbo.fx_psvl(left(@str,len(@str)-2))=1 and right(@str,2)='ed' then left(@str, len(@str)-2)+'\'
when len(@str)>3 and dbo.fx_psvl(left(@str,len(@str)-3))=1 and right(@str,3)='ing' then left(@str, len(@str)-3)+'\'
else @str end

if right(@str,1) = '\'
begin
set @str =replace(@str,'\','')
set @str = case
when  len(@str)>2 and right(@str,2) ='at' then left(@str, len(@str)-2)+'ate'
when  len(@str)>2 and right(@str,2) ='bl' then left(@str, len(@str)-2)+'ble'
when  len(@str)>2 and right(@str,2) ='iz' then left(@str, len(@str)-2)+'ize'
when  len(@str)>1 and dbo.fx_psCC(@str)=1 and right(@str,1) not in ('l','s','z')  then left(@str, len(@str)-1)
when  len(@str)>2 and dbo.fx_psmeas(@str)=1 and dbo.fx_psCVC(@str)=1  then @str+'e'
else @str end
end

--step 1c
set @str = case
when dbo.fx_psvl(left(@str,len(@str)-1))=1 and right(@str,1)='y' then left(@str, len(@str)-1)+'i'
else @str end

--step2
set @str = case 
when len(@str)>7 and dbo.fx_psMeas(left(@str,len(@str)-7))>0 and right(@str, 7)='ational' then left(@str,len(@str)-7)+'ate'
when len(@str)>6 and dbo.fx_psMeas(left(@str,len(@str)-6))>0 and right(@str, 6)='tional' then left(@str,len(@str)-6)+'tion'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4))>0 and right(@str, 4)= 'enci' then left(@str,len(@str)-4)+'ence'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4))>0 and right(@str, 4)='anci' then left(@str,len(@str)-4)+'ance'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4))>0 and right(@str, 4)='izer' then left(@str,len(@str)-4)+'ize'
when len(@str)>3 and dbo.fx_psMeas(left(@str,len(@str)-3))>0 and  right(@str, 3)='bli' then left(@str,len(@str)-3)+'ble'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4))>0 and right(@str, 4)='alli' then left(@str,len(@str)-4)+'al'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='entli' then left(@str,len(@str)-5)+'ent'
when len(@str)>3 and dbo.fx_psMeas(left(@str,len(@str)-3))>0 and right(@str, 3)='eli' then left(@str,len(@str)-3)+'e'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='ousli' then left(@str,len(@str)-5)+'ous'
when len(@str)>7 and dbo.fx_psMeas(left(@str,len(@str)-7))>0 and right(@str, 7)='ization' then left(@str,len(@str)-7)+'ize'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='ation' then left(@str,len(@str)-5)+'ate'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4))>0 and right(@str, 4)='ator' then left(@str,len(@str)-4)+'ate'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='alism' then left(@str,len(@str)-5)+'al'
when len(@str)>7 and dbo.fx_psMeas(left(@str,len(@str)-7))>0 and right(@str, 7)='iveness' then left(@str,len(@str)-7)+'ive'
when len(@str)>7 and dbo.fx_psMeas(left(@str,len(@str)-7))>0 and right(@str, 7)='fulness' then left(@str,len(@str)-7)+'ful'
when len(@str)>7 and dbo.fx_psMeas(left(@str,len(@str)-7))>0 and right(@str, 7)='ousness' then left(@str,len(@str)-7)+'ous'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='aliti' then left(@str,len(@str)-5)+'al'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5))>0 and right(@str, 5)='iviti' then left(@str,len(@str)-5)+'ive'
when len(@str)>6 and dbo.fx_psMeas(left(@str,len(@str)-6))>0 and  right(@str, 6)='biliti' then left(@str,len(@str)-6)+'ble'
else @str end


--step 3
set @str = case 
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4)) >0 and right(@str, 4)='logi' then left(@str,len(@str)-4)+'log'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5)) >0 and right(@str, 5)='icate' then left(@str,len(@str)-5)+'ic'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5)) >0 and right(@str, 5)='ative' then left(@str,len(@str)-5)
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5)) >0 and right(@str, 5)='alize' then left(@str,len(@str)-5)+'al'
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5)) >0 and right(@str, 5)='iciti' then left(@str,len(@str)-5)+'ic'
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4)) >0 and right(@str, 4)='ical' then left(@str,len(@str)-4)+'ic'
when len(@str)>3 and dbo.fx_psMeas(left(@str,len(@str)-3)) >0 and right(@str, 3)='ful' then left(@str,len(@str)-3)
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4)) >0 and right(@str, 4)='ness' then left(@str,len(@str)-4)
else @str end

-- step 4
set @str = case 
when len(@str)>5 and dbo.fx_psMeas(left(@str,len(@str)-5)) >1 and  right(@str, 5)='ement'then left(@str,len(@str)-5)
when len(@str)>5 and right(@str, 5)='ement'then @str
when len(@str)>4 and dbo.fx_psMeas(left(@str,len(@str)-4)) >1 and  right(@str, 4) in ('ance','ence','able','ible','ment') then left(@str,len(@str)-4)
when len(@str)>4 and  right(@str, 4) in ('ance','ence','able','ible','ment') then @str
when len(@str)>3 and dbo.fx_psMeas(left(@str,len(@str)-3)) >1 and  right(@str, 3)='ion'and right(left(@str,len(@str)-3),1) in ('s','t') then left(@str,len(@str)-3)
when len(@str)>3 and dbo.fx_psMeas(left(@str,len(@str)-3)) >1 and  right(@str, 3) in ('ent','ism','ate','iti','ous','ive','ize','ant') then left(@str,len(@str)-3)
when len(@str)>2 and dbo.fx_psMeas(left(@str,len(@str)-2)) >1 and right(@str, 2)in ('al','er','ic','ou') then left(@str,len(@str)-2)
else @str end

-- step 5a
set @str = case 
when len(@str)>1 and dbo.fx_psMeas(left(@str,len(@str)-1))>1 and right(@str,1) = 'e' then left(@str,len(@str)-1)
when len(@str)>1 and dbo.fx_psMeas(left(@str,len(@str)-1))=1 and dbo.fx_psCVC(left(@str,len(@str)-1)) = 0 and right(@str,1) = 'e' then left(@str,len(@str)-1)
else @str end

-- step 5b
set @str = case 
when len(@str)>1 and dbo.fx_psMeas(left(@str,len(@str)-1))>1 and dbo.fx_psCC(@str)=1 and  right(@str,1) = 'l' then left(@str,len(@str)-1)
else @str end

end
return @str
end 



3 comments:

  1. Hello, just wanted to mention, I liked this post.
    It was funny. Keep on posting!

    Feeel free to visit my web-site ... click the next site

    ReplyDelete

  2. Very good article . Thanks for sharing.
    Snowflake Training
    Snowflake Training in Hyderabad
    Snowflake Online Training
    Snowflake Online Training Hyderabad
    Snowflake Training Online
    Snowflake Training in Ameerpet
    Snowflake Training Institute in Hyderabad

    ReplyDelete