Python Program Template

From GM-RKB
Revision as of 21:11, 13 February 2015 by Gmelli (talk | contribs) (Created page with "A Python Program Template is a software program template that is a Python program. * <B>Counter-Example(s):</B> ** Perl Program Template. * <B>See:</B> Pytho...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

A Python Program Template is a software program template that is a Python program.



References

#!/usr/bin/python
# Template program

# Verbosity level for diagnostic output; the checks in this template
# compare it against 1, 2 and 3 to decide how much to print.
debug = 1
if debug >= 1:
    # Python concatenates with "+", not the Perl-style "." operator, and the
    # integer must be converted explicitly. A single-argument print(...)
    # call behaves the same under Python 2 and Python 3.
    print("DEBUG: " + str(debug))

# LIBRARIES

# Third-party libraries. When `debug` is truthy, each library's version is
# printed right after its import so a run log records the environment.
import pandas as pd
if debug:
    print("pandas version: " + pd.__version__)  # e.g. pandas version: 0.14.1
from pandas import DataFrame, Series

import numpy as np
if debug:
    print("numpy version: " + np.__version__)  # e.g. numpy version: 1.8.1
from numpy import random  # NOTE: binds numpy.random, not the stdlib random module

from re import split

import sklearn as skl
if debug:
    # The package is bound to the alias `skl`; the bare name `sklearn` is
    # never defined by the import above, so the version is read via the alias.
    print("sklearn version: " + skl.__version__)  # e.g. sklearn version: 0.15.0

# from sklearn import preprocessing, svm, cross_validation # labelEncoder
# from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import CountVectorizer
# import sklearn.feature_extraction.text
# from sklearn.externals import joblib

import gc

# GLOBALS

from datetime import datetime
from time import time

# Read the clock once so the date stamp and the full timestamp always agree,
# even when the script starts right around midnight.
_now = datetime.now()

# Date stamp: two-digit year, month, day.
dstamp = _now.strftime("%y%m%d")
if debug:
    print("dstamp=" + dstamp)  # e.g. dstamp=141113
# dstamp="141106"

# Timestamp: the date stamp followed by hours, minutes, seconds.
tstamp = _now.strftime("%y%m%d%H%M%S")
if debug:
    print("tstamp=" + tstamp)  # e.g. tstamp=141113032844

# Directory prefix that is concatenated in front of data file names.
dataDir = "../data/"

inFile = "file.tsv"
# NOTE(review): minDictionaryTermsFilter is only bound when debug >= 2;
# any code that reads it at a lower debug level will raise NameError.
if debug >= 2:
    minDictionaryTermsFilter = 1

array = ['val1', 'val2']

# Table of input files, one row per file, indexed by 'colId'.
# NOTE(review): colNameRecordSource and colNameRecordType are used as dict
# keys here but are not defined in the visible part of the template; they
# must be bound to column-name strings before this statement runs.
df = pd.DataFrame([
    {'colId': 0, colNameRecordSource: 'VIG', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_VIG.141114.tsv'},
    {'colId': 1, colNameRecordSource: 'EB', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_EB.141030b.tsv'},
])
df.set_index('colId', inplace=True)

# NOTE(review): presumably these are colId values selecting rows of `df`,
# but 2 matches neither colId listed above (0 and 1) -- confirm.
filesToProcess = pd.Series([2])

def getLabeledData(dataFilename, dataSource, debug):
    """Load one tab-delimited data file and tag every row with its source.

    The file at ``dataDir + dataFilename`` is read with pandas, reduced to
    the columns listed in the module-level ``cols_to_keep``, and those
    columns are renamed to ``colNameRecordTextContent`` and
    ``colNameRecordLabel`` (so ``cols_to_keep`` must select exactly two
    columns). A ``colNameRecordSource`` column holding ``dataSource`` is
    then added.

    Args:
        dataFilename: File name, appended to the module-level ``dataDir``.
        dataSource: Value stored in the source column of every row.
        debug: Verbosity level; at 3 or above a summary and a random sample
            of up to three rows is printed.

    Returns:
        The resulting pandas DataFrame.
    """
    # assumes: columns not empty
    # assumes: column headers are present
    df_tmp = pd.read_csv(dataDir + dataFilename, delimiter='\t', skipinitialspace=True)[cols_to_keep]
    df_tmp.columns = [colNameRecordTextContent, colNameRecordLabel]
    df_tmp[colNameRecordSource] = dataSource

    if debug >= 3:
        # Sampling without replacement cannot draw more rows than exist, so
        # cap the sample size for files with fewer than three rows.
        sampleSize = min(3, len(df_tmp))
        if sampleSize:
            sample = df_tmp.loc[random.choice(df_tmp.index, sampleSize, replace=False)]
        else:
            sample = df_tmp
        # One pre-formatted string keeps the print call valid under both
        # Python 2 and Python 3.
        print("--------------------\ndataSource = %s  rows = %d  sample\n%s"
              % (dataSource, len(df_tmp), sample))

    return df_tmp