GMRKB featurizeStrings.pl
Jump to navigation
Jump to search
A GMRKB featurizeStrings.pl is a short text string featurization system (that solves a short text string featurization task).
References
2013
#!/usr/bin/perl
#
# DESCRIPTION: a short text string featurization system
# AUTHOR: Gabor Melli
# DATE: 2013-01-16
# NOTES
# * it assumes that the input is a space-separated term and a label separated by a \tab character.
# REFERENCES
# * http://www.gabormelli.com/RKB/Short-Text-String_Featurization_System
# TODO
# * accept a dictionary of labeled terms.
# * also accept .CSV files (which likely assumes that text_strings are quoted).
use strict ;
sub textStringPattern($) ;
sub cmprsdTextStringPattern($) ;
my $caselessString=1 ; # support case-less text by casting upper-case to lower-case.
my $debug=0 ;
while (<STDIN>) {
chomp ;
my ($txtstr,$label)=split(/\t/) ;
$txtstr = lower($txtstr) if $caselessString ;
my @chars = split(//,$txtstr) ;
my $charCount = $#chars ;
my $char1 = $chars [0] ; # the first character
my $char1pattern = textStringPattern($char1) ;
my $charN = $chars [$charCount] ; # the last character
my $charNpattern = textStringPattern($charN) ;
my $patternString = textStringPattern($txtstr) ;
my $patternStringLen = length($patternString) ;
my $cmprsdPatternString = cmprsdTextStringPattern($patternString) ;
my $cmprsdPatternStringLen = length($cmprsdPatternString) ;
my $charsString = $txtstr; $charsString =~ s/[^a-z]//g ;
my $alphaChars = length($charsString) ;
my $hasAlphaChars=0; $hasAlphaChars = 1 if $alphaChars ;
my $numsString = $txtstr; $numsString =~ s/[^0-9]//g ;
my $numChars = length($numsString) ;
my $hasNumChars=0; $hasNumChars = 1 if $numChars ;
my $nonalphasString = $txtstr; $nonalphasString = s/[ 0-9a-z]//g ;
my $nonalphaChars = length($nonalphasString) ;
my @tokens = split(/ /) ;
my $tokenCount = $#tokens ;
# FUTURE features
my $tokensWithNumbers ;
my $tokensWithNonalphas ;
my $tokensWithAlphas ;
my $alltokensWithNumbers ;
my $alltokensWithNonalphas ;
my $alltokensWithAlphas ;
my $tokensMatchENCW ; # e.g. apple store => 2
my $tokensMatchSPCW ;
print "$txtstr" ;
print "\t$patternString\t$patternStringLen\t$cmprsdPatternString\t$cmprsdPatternStringLen" ;
print "\t$charCount" ;
print "\t$char1\t$char1pattern\t$charN\t$charNpattern" ;
print "\t$alphaChars\t$hasAlphaChars\t$numChars\t$hasNumChars\t$nonalphaChars" ;
print "\t$tokenCount" ;
print "\t$label\n" ;
}
sub textStringPattern($) { # ~Text Token Character Pattern Feature ([[Collins, 2002]]).
my $txtStr = shift() ;
$txtStr =~ s/[A-Z]/A/g ;
$txtStr =~ s/[a-z]/a/g ;
$txtStr =~ s/[0-9]/0/g ;
$txtStr =~ s/[^ a-z0-9]/-/g ;
return($txtStr) ;
}
sub cmprsdTextStringPattern($) {
my $txtStrPattern = shift() ;
$txtStrPattern =~ s/A+/A/g ;
$txtStrPattern =~ s/a+/a/g ;
$txtStrPattern =~ s/0+/0/g ;
$txtStrPattern =~ s/-+/-/g ;
return($txtStrPattern) ;
}