Python Program Template
A Python Program Template is a software program template that is a Python program.
- Counter-Example(s):
- See: Python Code.
References
#!/usr/bin/python
# Template program
# Debug verbosity level; diagnostics in this template are guarded by
# comparisons against it (e.g. `debug >= 1`).
debug = 1
if debug >= 1:
    # Use "+" with str(): in Python "." is attribute access, not string
    # concatenation, so `"DEBUG: " . debug` raised AttributeError.
    print("DEBUG: " + str(debug))
# LIBRARIES
# Each version banner is printed only when `debug` is truthy. The
# single-argument parenthesised print form works on Python 2 and Python 3.
import pandas as pd
if debug:
    print("pandas version: " + pd.__version__)  # pandas version: 0.14.1
from pandas import DataFrame, Series
import numpy as np
if debug:
    print("numpy version: " + np.__version__)  # numpy version: 1.8.1
from numpy import random  # random
from re import split
import sklearn as skl
if debug:
    # The package is bound to the alias `skl`; the bare name `sklearn` is
    # not defined by the import above, so the alias must be used here.
    print("sklearn version: " + skl.__version__)  # sklearn version: 0.15.0
# Optional imports, disabled by default:
# from sklearn import preprocessing, svm, cross_validation # labelEncoder
# from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import CountVectorizer
# import sklearn.feature_extraction.text
# from sklearn.externals import joblib
import gc
# GLOBALS
from datetime import datetime
from time import time

# Read the clock once so the date stamp and the date-time stamp always
# describe the same instant (two separate reads could straddle midnight).
_now = datetime.now()

# Date stamp, formatted yymmdd.
dstamp = _now.strftime("%y%m%d")
if debug:
    print("dstamp=" + dstamp)  # dstamp=141113
# dstamp="141106"

# Date-and-time stamp, formatted yymmddHHMMSS.
tstamp = _now.strftime("%y%m%d%H%M%S")
if debug:
    print("tstamp=" + tstamp)  # tstamp=141113032844

dataDir = "../data/"
inFile = "file.tsv"

# NOTE: minDictionaryTermsFilter is only bound here when debug >= 2.
if debug >= 2:
    minDictionaryTermsFilter = 1
# Example list literal.
array = ["val1", "val2"]

# Example table: one row per data file, indexed by 'colId'. The column
# names for the record source and record type come from the variables
# colNameRecordSource / colNameRecordType (not defined in this section).
df = pd.DataFrame(
    [
        {
            'colId': 0,
            colNameRecordSource: 'VIG',
            colNameRecordType: 'taxoPath',
            'dataFilename': 'pcTaxo_lbld2_VIG.141114.tsv',
        },
        {
            'colId': 1,
            colNameRecordSource: 'EB',
            colNameRecordType: 'taxoPath',
            'dataFilename': 'pcTaxo_lbld2_EB.141030b.tsv',
        },
    ]
).set_index('colId')

# Example one-element Series.
filesToProcess = pd.Series([2])
def getLabeledData(dataFilename, dataSource, debug):
    """Load a tab-delimited file of labeled records into a DataFrame.

    The file at ``dataDir + dataFilename`` is read, restricted to the
    columns listed in the module-level ``cols_to_keep``, and those columns
    are renamed to ``colNameRecordTextContent`` and ``colNameRecordLabel``
    (so ``cols_to_keep`` must select exactly two columns). A column named
    by ``colNameRecordSource`` is then added, holding ``dataSource`` in
    every row.

    Args:
        dataFilename: File name, appended to the module-level ``dataDir``.
        dataSource: Value written to the ``colNameRecordSource`` column.
        debug: Verbosity level; at 3 or above the row count and a random
            sample of up to three rows are printed.

    Returns:
        The resulting pandas DataFrame.
    """
    # assumes: columns not empty
    # assumes: column headers are present
    df_tmp = pd.read_csv(dataDir + dataFilename, delimiter='\t', skipinitialspace=True)[cols_to_keep]
    df_tmp.columns = [colNameRecordTextContent, colNameRecordLabel]
    df_tmp[colNameRecordSource] = dataSource
    if debug >= 3:
        # Sampling without replacement cannot draw more rows than exist,
        # so cap the sample size at the frame length.
        sample_size = min(3, len(df_tmp))
        if sample_size:
            sample = df_tmp.loc[random.choice(df_tmp.index, sample_size, replace=False)]
        else:
            sample = df_tmp
        # One pre-formatted string keeps the output identical on Python 2
        # and 3; "\n" replaces the invalid "\d" escape of the original.
        print("--------------------\ndataSource = %s  rows = %s  sample\n%s"
              % (dataSource, len(df_tmp), sample))
    return df_tmp