Python Program Template

From GM-RKB
Jump to navigation Jump to search

A Python Program Template is a software program template that is written in Python (i.e. a skeleton Python program meant to be copied and adapted).



References

#!/usr/bin/python

# Template program

# Verbosity level for the whole script: 0 silences the diagnostic prints,
# 1 and above echo progress information (library versions, timestamps, ...).
debug = 1
# Single-argument print(...) behaves the same on Python 2 and Python 3.
# The two spaces after the colon reproduce what `print "DEBUG: ", debug`
# emitted (trailing space in the literal + print's own separator).
if debug >= 1: print("DEBUG:  %s" % debug)

#############################
# LIBRARIES
#
# At debug>=1 the version of each major library is echoed right after it is
# imported, so a run's log records the environment it was produced with.
# The prints use a single pre-formatted argument so they work unchanged on
# Python 2 and Python 3; the two spaces after each colon match the output of
# the former `print a, b` statements.

import pandas as pd # for tabular data processing
if debug >= 1: print("pandas version:  %s" % pd.__version__)   # pandas version:  0.14.1
from pandas import DataFrame, Series

import numpy as np # for numerical processing
if debug >= 1: print("numpy version:  %s" % np.__version__)    # numpy version:  1.8.1
from numpy  import random # numpy.random (not the stdlib random module)
from re     import split # for pattern-based text splitting (e.g. on \tabs).
import gc # garbage collection

import sklearn as skl # for machine learning
if debug >= 1: print("sklearn version:  %s" % skl.__version__)    # sklearn version:  0.15.0
# Optional scikit-learn pieces; uncomment the ones the program needs.
# from sklearn import preprocessing, svm, cross_validation  # labelEncoder
# from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import CountVectorizer
# import sklearn.feature_extraction.text 
# from sklearn.externals import joblib

###########################
# GLOBALS

from datetime import datetime
from time import time

# Read the clock once so the date stamp and the full timestamp always agree,
# even when the script happens to start right at midnight.
_startTime = datetime.now()
dstamp = _startTime.strftime("%y%m%d")  # yymmdd
if debug >= 1: print("dstamp=" + dstamp) # dstamp=141113
# Uncomment to pin the date stamp to a fixed day:
#dstamp="141106"
tstamp = _startTime.strftime("%y%m%d%H%M%S")  # yymmddHHMMSS
if debug >= 1: print("tstamp=" + tstamp) # tstamp=141113032844

# Directory that input file names are resolved against (note trailing slash:
# paths are built by plain string concatenation).
dataDir = "../data/"

inFile = "file.tsv"
# NOTE(review): this name only exists when debug>=2; any later unconditional
# use would raise NameError at lower debug levels -- confirm this is intended.
if debug >= 2: minDictionaryTermsFilter = 1

array = ['val1', 'val2']

# One row per input data file, indexed by colId.
df = pd.DataFrame([
   { 'colId':0, 'colNameRecordSource' : 'VIG',    'colNameRecordType': 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_VIG.141114.tsv'},
   { 'colId':1, 'colNameRecordSource' : 'EB',     'colNameRecordType': 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_EB.141030b.tsv'},
])
df.set_index('colId', inplace=True)

# NOTE(review): presumably a list of colId values to process; 2 is not a colId
# present in df above (only 0 and 1) -- confirm before using it as a lookup.
filesToProcess = pd.Series([2])

def getLabeledData(dataFilename, dataSource, debug):
    """Load one labeled tab-separated file into a three-column DataFrame.

    Reads ``dataDir + dataFilename``, keeps only the columns listed in the
    module-level ``cols_to_keep``, renames them to
    ``[colNameRecordTextContent, colNameRecordLabel]`` and adds a
    ``colNameRecordSource`` column holding ``dataSource`` on every row.

    Assumes:
      * the file has a header row;
      * ``cols_to_keep`` names exactly two columns (text first, label second),
        otherwise the rename fails;
      * the kept columns are not empty.

    Args:
        dataFilename: file name relative to ``dataDir``.
        dataSource: value stored in the source column of every row.
        debug: verbosity; at 3 or above a summary and a random sample of up
            to three rows are printed.

    Returns:
        The resulting DataFrame.
    """
    df_tmp = pd.read_csv(dataDir + dataFilename, delimiter='\t', skipinitialspace=True)[cols_to_keep]
    df_tmp.columns = [colNameRecordTextContent, colNameRecordLabel]
    df_tmp[colNameRecordSource] = dataSource
    if debug >= 3:
        # Sampling without replacement cannot draw more rows than exist, so
        # cap the sample size; skip sampling entirely for an empty frame.
        sampleSize = min(3, len(df_tmp))
        if sampleSize:
            sample = df_tmp.loc[random.choice(df_tmp.index, sampleSize, replace=False)]
        else:
            sample = df_tmp
        # One pre-formatted argument keeps this valid on Python 2 and 3.
        # "\n" replaces the original "\d", an invalid escape (typo).
        print("--------------------\ndataSource = %s  rows = %s  sample\n%s"
              % (dataSource, len(df_tmp), sample))
    return df_tmp