tok2WikiFormat.pl

tok2WikiFormat.pl is a Perl program that converts a token file (tokenFile format) into wiki-formatted text (wikiFile format).
See: Perl Program, Tokenization System.
References

#!/usr/bin/perl 

use strict ;
# Program identity; these values are interpolated into the usage text.
# $CONTACT is kept in single quotes so the '@' characters are not interpolated.
my ($PROGRAM, $CONTACT)     = ("_tokToWikiFormat.pl", '_tokToWikiFormat@at@gabormelli.com') ;
my ($VERSION, $VERSIONDATE) = ('1.1', "2009.11.11") ;

#########################################################################################
# == SYNOPSIS:
# * Convert tokenFile format to wikiFile format.
#
# == USAGE
# * Input: tokenFile
# * Input: SynonymToConceptId (hardcoded)
#
# == TODO
# * support the reading of a header (with column names)
#########################################################################################

#use warnings ; 

# PACKAGES
use lib '..' ;
use lib './lib' ;
use lib '../lib' ;
use TABularDataFileMgmt ;
use TGQueryDataMgmt ;
use Getopt::Long;

#########################################################################################
# GLOBAL SCALARS
# Default debug level; it is also handed to the tabular-file module right away.
my $debug = 2 ;
$TABularDataFileMgmt::debug = $debug ;
TABularDataFileMgmt::settimestamp() ;
my $runningSeconds_ = TABularDataFileMgmt::getrunningseconds() ;

# Name of the (hardcoded) synonym/concept-id lookup file.
my $SynonymToConceptIdFile = "SynonymToConceptId.dat" ;
# Set from --removeDocumentId; used as a true/false switch further down.
my $removeDocumentId ;

# GLOBAL ARRAYS
my (@Token, @Document, @Sentence) ;

# Some standard column configurations, indexed by type id.
# Note that ids 0 and 6 have no configuration.
my @TokFileAttrsType ;
@TokFileAttrsType[1 .. 5, 7 .. 10] = (
   "Token POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8",
   "DocumentExtId Token POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8",
   "Token BaseWord POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8",
   "DocumentExtId Token BaseWord POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8",
   "Token POS Parse BIO ConceptType ConceptId",
   "Token POS Parse Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO",
   "Token POS Parse Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO ConceptType ConceptId",
   "Token POS       Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO BIOp",
   "Token POS       Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO ConceptType ConceptId",
) ;

# Token input file description
my $inputTokFileAttrs ;            # attribute names of the token input file, as one string
my @InputTokFileAttrs ;            # the same attribute names, one per element
my $inputTokFileAttrsTypeId = 10 ; # the default column configuration

# Command-line defaults ("-" is the default for both the input and the output file).
my ($inputTokFile, $outputFile) = ("-", "-") ;
my ($verbose, $help)            = (0, 0) ;
# Usage text. It is built BEFORE the command line is parsed so that every
# def[...] entry reports the built-in default rather than a user-supplied value.
my $USAGE = join ("",
   "USAGE:\n",
   "   Program info: Version: $VERSION($VERSIONDATE)   Contact: $CONTACT\n",
   "   Example: ./$PROGRAM -f input.tok -o output.wiki --inputTokFileAttrsTypeId 10 -d 1\n",
   "   PARAMS:\n",
   "      f|inputTokFile=s def[$inputTokFile]\n",
   "      inputTokFileAttrsTypeId:s def[$inputTokFileAttrsTypeId]\n",
   "      inputTokFileAttrs:s def[" . (defined $inputTokFileAttrs ? $inputTokFileAttrs : "") . "]\n",
   "      o|outputFile:s def[$outputFile]\n",
   "      removeDocumentId=s (a true value skips the document identification section)\n",
   "      d|debug=i def[$debug]\n",
   "      v|verbose+\n",
   "      h|help\n",
) ;

# Parse the command line. GetOptions returns false when it meets an unknown or
# malformed option; stop in that case instead of running with a partly
# understood configuration.
my $optionsOk_ = GetOptions (
       'f|inputTokFile=s'	    => \$inputTokFile,
       'o|outputFile:s'		    => \$outputFile,
       'removeDocumentId=s'	    => \$removeDocumentId,
       'inputTokFileAttrsTypeId:s'  => \$inputTokFileAttrsTypeId ,
       'inputTokFileAttrs:s'        => \$inputTokFileAttrs ,
       'd|debug=i'            	    => \$debug,
       'v|verbose+'         	    => \$verbose,
       'h|help'         	    => \$help
);
if (not $optionsOk_) {
   print $USAGE ;
   die ("ERROR: could not parse the command-line options.\n\n") ;
}

if ($help) {
   print $USAGE ;
   exit ;
}

#########################################################################################
# Process the input attributes

# Effective debug level: the -d/--debug value plus one for every -v/--verbose flag.
$debug=$debug+$verbose ;
# Hand the effective level to the tabular-file module as well. The module's level
# is first assigned near the top of the script, before the command line has been
# parsed, so without this line -d/-v would never reach it.
$TABularDataFileMgmt::debug = $debug ;
if ($debug>0) {
   print "DEBUG: Debugging enabled and set to level: $debug.\n" ;
}

# Test the inputTokFile parameter
if (not defined($inputTokFile) ) {
   print $USAGE ;
   die ("ERROR a0djla: requires that --inputTokFile parameter be provided.\n\n") ;
}
# "-" (the default) selects standard input, so there is nothing to probe on disk.
if ($inputTokFile ne "-") {
   if (-d $inputTokFile ) {
      die("\nERROR aolkdj: The InputFile parameter must be a file not a directory [$inputTokFile].\n") ;
   }
   # Readability probe only: the handle is closed again right away and the real
   # open is left to the reading subroutine. Three-argument open keeps the file
   # name from being interpreted as an open mode.
   open (my $probeHandle_, "<", $inputTokFile)
      or die("\nERROR oxlkkd: could not open input file [$inputTokFile]: $!\n") ;
   close($probeHandle_) ;
}

# Test the outputFile parameter
# $outputFileHandle stays undefined when the output goes to STDOUT, i.e. when
# outputFile is exactly "-" or empty (-o given without a value). An exact
# comparison is used so that file names which merely contain a hyphen
# (e.g. my-output.wiki) are written to disk rather than to STDOUT.
my $outputFileHandle ;
if (defined($outputFile) and $outputFile ne "" and $outputFile ne "-") {
   if (-d $outputFile ) {
      die("\nERROR lasjii: The OutputFile parameter must be a file not a directory [$outputFile].\n") ;
   }
   if (not open ($outputFileHandle, ">", $outputFile) ) {
      die("\nERROR aanxks: could not open output file [$outputFile]: $!\n") ;
   }
}

# Decide which attribute (column) names describe the token input file.
# An explicit --inputTokFileAttrs list wins; otherwise the standard
# configuration selected by --inputTokFileAttrsTypeId is used.
if (not defined $inputTokFileAttrs or $inputTokFileAttrs !~ /\S/) {
   # Reject ids that are not plain numbers or that have no configuration,
   # instead of silently continuing with an empty attribute list.
   if (not defined $inputTokFileAttrsTypeId
       or $inputTokFileAttrsTypeId !~ /^\d+$/
       or not defined $TokFileAttrsType[$inputTokFileAttrsTypeId]) {
      my $given_    = defined $inputTokFileAttrsTypeId ? $inputTokFileAttrsTypeId : "" ;
      my $validIds_ = join (",", grep { defined $TokFileAttrsType[$_] } 0 .. $#TokFileAttrsType) ;
      die("\nERROR: unknown inputTokFileAttrsTypeId [$given_]; valid ids are [$validIds_].\n") ;
   }
   $inputTokFileAttrs = $TokFileAttrsType[$inputTokFileAttrsTypeId] ;
}
# split ' ' splits on runs of whitespace and ignores leading whitespace, so no
# empty attribute name can be produced.
@InputTokFileAttrs = split (' ', $inputTokFileAttrs) ; # array of attribute names
print "DEBUG:  inputTokFileAttrs=[@InputTokFileAttrs]\n" if $debug>=3 ;

##############
# not using TAB module ... yet
# Load the conceptId => preferred concept name lookup from the tab-separated
# SynonymToConceptId file. A missing file is reported but is not an error.
print "DEBUG: Read in the SynonymToConceptIdFile[$SynonymToConceptIdFile]\n" if $debug>=1 ;
my %ConceptIdToPreferredName ;
if (-e $SynonymToConceptIdFile ) {
   open (my $synonymFileHandle_, "<", $SynonymToConceptIdFile)
      or die "ERROR: Could not open [$SynonymToConceptIdFile]: $!\n" ;
   while (my $line_ = <$synonymFileHandle_>) {
      print "." if $debug>=4 ;
      $line_ =~ s/\r?\n\z// ; # drop the line terminator before splitting
      my @R_ = split /\t/, $line_ ;
      ##ConceptName    ConceptId       Preferred
      #-       5302    0
      #1-Nearest Neighbor Range Search Task    4080    1

      # We only need the preferred name. The comparison is numeric, so the
      # header row and rows without a third column are skipped as well.
      next if (not defined $R_[2] or $R_[2] != 1) ;

      my $conceptId_       = $R_[1] ;
      my $conceptPrefName_ = $R_[0] ;

      $ConceptIdToPreferredName{$conceptId_} = $conceptPrefName_ ;
      print "DEBUG: ConceptIdToPreferredName{$conceptId_} = $conceptPrefName_\n" if $debug>=4 ;
   }
   close($synonymFileHandle_) ;
}
else {
   print "DEBUG: not found [$SynonymToConceptIdFile]\n" if $debug>=0 ;
}
print "\n" if $debug>=4 ;

################################################################
################################################################
# Read-in the token input file and populate some prelim structures
$runningSeconds_=TABularDataFileMgmt::getrunningseconds() ;
print "DEBUG: [$runningSeconds_]s READ IN OUR DOCUMENT [$inputTokFile] with [" if $debug>=1 ;
TABularDataFileMgmt::ReadTabularFileIntoArrayOfHashes($inputTokFile, \@Token, \@InputTokFileAttrs) ;
# Report the number of records, scalar(@Token). The last index $#Token is one
# less than that, because the processing loop treats index 0 as a record.
print scalar(@Token) . "] records.\n" if $debug>=1 ;

if ($debug>=3) {
   print "\nTokens :\n" ;
   TABularDataFileMgmt::WriteArrayOfHashesToTabularFile("-", \@Token, \@InputTokFileAttrs );
}

# Populate the @Sentence structure
$runningSeconds_=TABularDataFileMgmt::getrunningseconds() ;
print "DEBUG: [$runningSeconds_]secs EXTRACT THE DOCUMENT AND SENTENCE INFORMATION\n" if $debug>=2 ;

# This is an overkill.
TGQueryDataMgmt::PopulateDocumentSentence(\@Token, \@Document, \@Sentence) ;

if ($debug>=2) {
   print "DEBUG:   COMPLETED: Documents=[" . scalar(@Document) . "]  Sentences=[" . scalar(@Sentence) . "]\n" ;
}

##############
# Walk the token records in order and re-stitch them into raw wiki text in
# $rawRestiched. Tokens tagged B open a "[[" concept mention, tokens tagged I
# extend its anchor text, and the mention is closed with
# "PreferredName|anchor text]]" (or just "anchor text]]" when no concept id is
# known). Tokens tagged O are copied through, and a record without a BIO tag
# marks the end of a sentence.
my $rawRestiched = "" ; # will contain the initial raw restiched results.
my $documentId ; # currently just for debugging
my $docSectionFlag=0 ; # special handling when a documentId is encountered.
$docSectionFlag=3 if $removeDocumentId ;
my $prevBioTag='O' ;
my $prevConceptId ;
my $cMAnchorText = "" ;
print "DEBUG: PROCESS the inputTokFile\n" if $debug>=1 ;

# Emit the body and the closing "]]" of the concept mention collected so far,
# then reset the mention state. Dies when the pending concept id has no
# preferred name in %ConceptIdToPreferredName.
my $closeConceptMention_ = sub {
  print "DEBUG:   End of a concept mention\n" if $debug>=3 ;
  my $conceptName_  ;
  if ($prevConceptId) {
    $conceptName_ = $ConceptIdToPreferredName{$prevConceptId} ;
    print "DEBUG:   prevConceptId[$prevConceptId] => prevConceptName[$conceptName_]\n" if $debug>=2 ;
    die "ERROR: unrecog concept name for conceptId[$prevConceptId]\n" if (not defined $conceptName_) ;
    undef $prevConceptId ;
  }
  if (defined $conceptName_) { $rawRestiched .= $conceptName_ . "|" . $cMAnchorText }
  else                       { $rawRestiched .= $cMAnchorText }
  $rawRestiched .= "]]" ;
  undef $cMAnchorText ;
} ;

for my $tokenId_ (0 .. $#Token) {

  my $token_        = $Token[$tokenId_]{Token} ;
  my $bioTag_       = $Token[$tokenId_]{BIO} ;
  my $conceptIdTxt_ = $Token[$tokenId_]{ConceptId} ;
  # Strip parentheses, whitespace and hyphens from the concept id text;
  # an UNLINKED marker means that there is no concept id at all.
  my $conceptId_ = $conceptIdTxt_ ;
  $conceptId_ =~ s/[\(\)\s\-]//g ;
  undef $conceptId_ if ($conceptId_ =~ m/UNLINKED/) ;
  print "DEBUG:  token[$token_] bioTag[$bioTag_] conceptId[$conceptId_] cMAnchorText[$cMAnchorText] prevConceptId[$prevConceptId] docSectionFlag[$docSectionFlag]\n" if $debug>=2 ;

  ######## HANDLE DOCUMENT IDENTIFICATION SECTION #########
  # The pattern to detect a new document and extract the document identifier:
  # either NNNN_xxx or a DOI-like 10.NNNNxxx (the dot is matched literally).
  if ($token_ =~ m/^(\d\d\d\d_[^\s]+)/ or $token_ =~ m/^(10\.\d\d\d\d[^\s]+)/) {
    $rawRestiched .= "\n$token_\t";
    $documentId = $1 ;
    $documentId =~ s/\//_/g ;
    print "DEBUG:   Found document start [$documentId]\n" if $debug>=3;
    $docSectionFlag=2 ;
    next ;
  }

  # test whether we need to skip the document section.
  if ($removeDocumentId and $docSectionFlag) {
    if ($docSectionFlag > 1) {
      # Keep skipping; a token that contains a non-word character moves us on
      # to the "wait for the first content token" state.
      $docSectionFlag = 1 if ($token_ =~ /[^\w\d]/) ;
      next ;
    }

    if ($token_ =~ /[^\s]/) { # Look for the first content token.
      $docSectionFlag=0 ;
      print "DEBUG:   In document content section\n" if $debug>=3;
    }
    else {
      next ;
    }
  }
  ############################################################

  # Mark the end of a concept mention (on the previous token): the previous
  # token was inside a span (its tag has a non-'O' character, i.e. B or I) and
  # the current token does not continue it. A record without a BIO tag (end of
  # sentence) does not continue a span either, so it also closes the mention.
  if ((not defined $bioTag_ or $bioTag_ =~ m/[^I]/) and $prevBioTag =~ m/[^O]/) {
    $closeConceptMention_->() ;
  }
  # Can now commit to a conceptId in the current token (if present).
  # NOTE(review): a digit is also a word character, so this condition can never
  # be true as written; kept unchanged because the intended test is unclear.
  $prevConceptId = $conceptId_ if ($conceptId_ =~ m/\d/ and not $conceptId_ =~ m/\w/) ;

  if ($bioTag_ =~ m/B/) { # Mark the start of a concept mention
    $rawRestiched .= " [[" ;
    $prevConceptId  = $conceptId_ if ($conceptId_ =~ m/\d/) ;
    $cMAnchorText = $token_ ;
    print "DEBUG:   mark the start of a concept mention (token[$token_]/bioTag=[$bioTag_])\n" if $debug>=4 ;
  }
  elsif ($bioTag_ =~ m/O/) { # Commit the token (insert appropriate spacing)
    print "DEBUG:   commit the token [$token_] (not [$cMAnchorText])\n" if $debug>=3 ;
    my $space_ = " " ;
    $rawRestiched .= "$space_$token_" ;
  }
  elsif ($bioTag_ =~ m/I/) { # Extend the anchor text of the open mention
    $cMAnchorText .= " $token_" if defined $cMAnchorText ;
    my $space_ = " " ;
    $rawRestiched .= $space_ ;
    print "DEBUG:   insert the space between [$cMAnchorText])\n" if $debug>=3 ;
  }
  elsif (not defined $bioTag_) { # Mark an end of sentence.
    $rawRestiched .= " <\/s> " ;
    print "DEBUG:   mark the end of a sentence (token[$token_]/bioTag=[$bioTag_])\n" if $debug>=3 ;
  }
  else {
    die "ERROR(laks0hs): bioTag[$bioTag_]\n" ;
  }

  $prevBioTag=$bioTag_ ;
}

# A mention that is still open when the input ends has no following token to
# trigger its closing brackets, so close it here.
$closeConceptMention_->() if (defined $prevBioTag and $prevBioTag =~ m/[^O]/) ;

####################################
# Turn the raw re-stitched text into its final wiki form and write it out.
# The substitutions run on a private copy (aliased to $_ only inside the for
# block), so neither $rawRestiched nor the global $_ is modified.

my $wikiText_ = $rawRestiched ;
for ($wikiText_) {
   # recreate the typographical look&feel (e.g. periods, commas, ...)
   s/ +/ /g ;                               # collapse runs of spaces
   s/ \. <\/s>/. <\/s>/g ;                  # no space before a sentence-final period
   s/[ ]+([\,:;"\)])/$1/g ;                 # no space before , : ; " )
   s/(\() /$1/g ;                           # no space after (
   # The next five rules tidy the "[[" ... "]]" link brackets. The opening-side
   # patterns are written as "[[" followed by optional spaces, mirroring the
   # closing-side rules; the previous form "\[\ [[]" looked for "[ [", which the
   # re-stitching step never emits.
   s/\[\[[ ]*?<([\/]*?)i>/<$1i>[[/gi ;      # move an <i> or </i> tag in front of "[["
   s/<([\/])*?i>[ ]*?\]\]/]]<$1i>/gi ;      # move an <i> or </i> tag behind "]]"
   s/\[\[[ ]+/[[/g ;                        # no spaces right after "[["
   s/[ ]+?\]\]/]]/g ;                       # no spaces right before "]]"
   s/\[\[[ ]*?\]\]//g ;                     # drop empty brackets

   # reproduce the items that were reformatted for convenience (e.g. _BR_ Yahoo_)
   s/_BR_/<BR>/g ;
   s/_sq_/``/g ;
   s/_eq_/''/g ;
   s/_#_/ # /g ;
   s/_7_(.+?)_7_/&#$1;/g ;
   s/Yahoo_/Yahoo!/g ;

   # fixup some simple mistakes
   # remove brackets around zero or one characters
   s/\[\[(.{0,1})\]\]/$1/g ;

   # ?? (stale?)
   # put a 2008_NNN identifier on a line of its own start, followed by a <tab>
   s/2008_(\d+?)\. [\s]+?<\/s>[\s]*?/\n2008_$1\t/g ;
}

if (defined $outputFileHandle) {
  print {$outputFileHandle} $wikiText_ ;
  # Buffered write errors only surface at close, so check it.
  close($outputFileHandle)
    or die("\nERROR: could not finish writing the output file: $!\n") ;
} else {
  print STDOUT $wikiText_ ;
}
tok2WikiFormat.pl

References

Navigation menu

Search