GMRKB featurizeStrings.pl
Jump to navigation
Jump to search
A GMRKB featurizeStrings.pl is a short text string featurization system (that solves a short text string featurization task).
References
2013
#!/usr/bin/perl # # DESCRIPTION: a short text string featurization system # AUTHOR: Gabor Melli # DATE: 2013-01-16 # NOTES # * it assumes that the input is a space-separated term and a label separated by a \tab character. # REFERENCES # * http://www.gabormelli.com/RKB/Short-Text-String_Featurization_System # TODO # * accept a dictionary of labeled terms. # * also accept .CSV files (which likely assumes that text_strings are quoted). use strict ; sub textStringPattern($) ; sub cmprsdTextStringPattern($) ; my $caselessString=1 ; # support case-less text by casting upper-case to lower-case. my $debug=0 ; while (<STDIN>) { chomp ; my ($txtstr,$label)=split(/\t/) ; $txtstr = lower($txtstr) if $caselessString ; my @chars = split(//,$txtstr) ; my $charCount = $#chars ; my $char1 = $chars [0] ; # the first character my $char1pattern = textStringPattern($char1) ; my $charN = $chars [$charCount] ; # the last character my $charNpattern = textStringPattern($charN) ; my $patternString = textStringPattern($txtstr) ; my $patternStringLen = length($patternString) ; my $cmprsdPatternString = cmprsdTextStringPattern($patternString) ; my $cmprsdPatternStringLen = length($cmprsdPatternString) ; my $charsString = $txtstr; $charsString =~ s/[^a-z]//g ; my $alphaChars = length($charsString) ; my $hasAlphaChars=0; $hasAlphaChars = 1 if $alphaChars ; my $numsString = $txtstr; $numsString =~ s/[^0-9]//g ; my $numChars = length($numsString) ; my $hasNumChars=0; $hasNumChars = 1 if $numChars ; my $nonalphasString = $txtstr; $nonalphasString = s/[ 0-9a-z]//g ; my $nonalphaChars = length($nonalphasString) ; my @tokens = split(/ /) ; my $tokenCount = $#tokens ; # FUTURE features my $tokensWithNumbers ; my $tokensWithNonalphas ; my $tokensWithAlphas ; my $alltokensWithNumbers ; my $alltokensWithNonalphas ; my $alltokensWithAlphas ; my $tokensMatchENCW ; # e.g. apple store => 2 my $tokensMatchSPCW ; print "$txtstr" ; print "\t$patternString\t$patternStringLen\t$cmprsdPatternString\t$cmprsdPatternStringLen" ; print "\t$charCount" ; print "\t$char1\t$char1pattern\t$charN\t$charNpattern" ; print "\t$alphaChars\t$hasAlphaChars\t$numChars\t$hasNumChars\t$nonalphaChars" ; print "\t$tokenCount" ; print "\t$label\n" ; } sub textStringPattern($) { # ~Text Token Character Pattern Feature ([[Collins, 2002]]). my $txtStr = shift() ; $txtStr =~ s/[A-Z]/A/g ; $txtStr =~ s/[a-z]/a/g ; $txtStr =~ s/[0-9]/0/g ; $txtStr =~ s/[^ a-z0-9]/-/g ; return($txtStr) ; } sub cmprsdTextStringPattern($) { my $txtStrPattern = shift() ; $txtStrPattern =~ s/A+/A/g ; $txtStrPattern =~ s/a+/a/g ; $txtStrPattern =~ s/0+/0/g ; $txtStrPattern =~ s/-+/-/g ; return($txtStrPattern) ; }