Gabor Melli's v2nodeClassify.TrainEval.150107
Jump to navigation
Jump to search
v2nodeClassify.TrainEval.150107
if debug>=2:
print df_labeledData.loc[2:2]['taxoPath']
# confirm that the [[vector]]s have not been jumbled.
df_vectorizedTokens.loc[testRecordId]['ut_asian'] # zirconia?
# simple test to see the features at work
unigramVectorizer.transform(['00 09 electronics']).toarray() # array([[1, 0, 1, ..., 0, 0, 0]], dtype=int64)
if debug>=4:
df_featuresTable_EB=df_extrFeatures [(df_labeledData.taxoCode=='EB')][cols2retainMc]
#df_featuresTable_EB.loc[random.choice(df_featuresTable_EB.index, 5, replace=False)]
print len(df_featuresTable_EB) #
df_featuresTable_SHZ=df_extrFeatures [(df_labeledData.taxoCode=='SHZ')][cols2retainMc]
#df_featuresTable_SHZ.loc[random.choice(df_featuresTable_SHZ.index, 5, replace=False)]
print len(df_featuresTable_SHZ) #
if debug>=2:
ndarr_preds_EB_SHZ = svm_clfr_EB.predict(df_featuresTable_SHZ)
accuracy_score(targetEncoder.transform(ndarr_labelData_SHZ), targetEncoder.transform(ndarr_preds_EB_SHZ)
# 0.4881 0.2768
ndarr_preds_SHZ_EB = svm_clfr_SHZ.predict(df_featuresTable_EB)
accuracy_score(targetEncoder.transform(ndarr_labelData_EB), targetEncoder.transform(ndarr_preds_SHZ_EB)
# 0.08424 0.139
Gabor_Melli's_v2nodeClassify.TrainEval.150107 is a supervised product-taxonomy node classification system.
References
2015
TRAIN (AND EVALUATE) A TAXO-NODE CLASSIFICATION MODEL¶
This notebook represents a supervised product taxonomy classification system (based on a one-vs-rest supervised classification algorithm).
To Do
- Add text preprocessing
- Make featurization a subroutine
- Include a feature for the type of record (taxonomy, or document)
In [1]:
# LIBRARIES
debug = 1
import pandas as pd
if debug: print "pandas version: " + pd.__version__ # pandas version: 0.14.1
from pandas import DataFrame, Series
import numpy as np
if debug: print "numpy version: " + np.__version__ # numpy version: 1.8.1
from numpy import random # random
from re import split
import sklearn as skl
from sklearn import preprocessing, svm, cross_validation # labelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.feature_extraction.text
from sklearn.externals import joblib
if debug: print "sklearn version: " + sklearn.__version__ # sklearn version: 0.15.0
import gc
In [2]:
# GLOBALS
from datetime import datetime
from time import time
# day stamp used to version every artifact written by this run
dstamp=datetime.now().strftime("%y%m%d")
if debug: print "dstamp=" + dstamp # dstamp=141113
#dstamp="141106"
# second-resolution stamp for uniquely naming individual outputs
tstamp=datetime.now().strftime("%y%m%d%H%M%S")
if debug: print "tstamp=" + tstamp # tstamp=141113032844
# input data and output model directories (relative to the notebook)
dataDir = "../data/"
modelsDir = "models/"
# dictionary of product-category/brand-name/product-feature terms
dictionaryFile="PC_BN_PF_terms.141107b.tsv"
#dictionaryFile="PC_BN_PF_BPC_terms.141208.tsv"
# serialized featurizers and model, all stamped with dstamp
corpusBasedUnigramVectorizerFilename = "corpusBasedUnigramVectorizer." + dstamp + ".sklearn"
corpusBasedUnigramVectorizerFile = modelsDir + corpusBasedUnigramVectorizerFilename
dictPopularPairsFilename = "dictPopularPairs." + dstamp + ".tsv"
dictPopularPairsFile = modelsDir + dictPopularPairsFilename
dictBasedUnigramVectorizerListFilename = "dictBasedUnigramVectorizerList." + dstamp + ".sklearn"
dictBasedUnigramVectorizerListFile = modelsDir + dictBasedUnigramVectorizerListFilename
modelFilename="svm_clfr_all." + dstamp + ".sklearn"
modelFile=modelsDir + modelFilename
# canonical column names shared by every cell below
colNameRecordType = 'recordType'
colNameRecordTextContent = 'recordTextContent'
colNameRecordLabel = 'recordLabel'
colNameRecordSource = 'recordSource'
colNameTermType = 'type'
colNameTermCategory = 'category'
colNameRecordTextContentTokens = 'tokens'
colNameRecordTextContentTokensCount = 'tokensCount'
In [3]:
#TRAIN CONFIGURATION
# min number of dict terms required for a feature-pair (e.g. FS/PC)
minDictionaryTermsFilter=3000
if debug>=2: minDictionaryTermsFilter=1
# min number of samples per label
minRowsPerLabel = 4
if debug>=2: minRowsPerLabel = 1
taxoDataCols=['taxoPath','dataFilename']
df_lbldDataInfo = pd.DataFrame([
{ 'colId':0, colNameRecordSource : 'VIG', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_VIG.141114.tsv'},
{ 'colId':1, colNameRecordSource : 'EB', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_EB.141030b.tsv'},
{ 'colId':2, colNameRecordSource : 'PG', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_PG.141030b.tsv'},
{ 'colId':3, colNameRecordSource : 'SHZ', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_SHZ.141030b.tsv'},
{ 'colId':4, colNameRecordSource : 'CJ', colNameRecordType: 'taxoPath', 'dataFilename': 'pcTaxo_lbld2_CJ.141114.tsv'},
{ 'colId':5, colNameRecordSource : 'CPROD_sml', colNameRecordType: 'passage', 'dataFilename': 'text_lbld2_CPROD1_0-40.141030.tsv'},
{ 'colId':6, colNameRecordSource : 'CPROD_med', colNameRecordType: 'passage', 'dataFilename': 'text_lbld2_CPROD1_170-230.141030.tsv'},
{ 'colId':7, colNameRecordSource : 'all', colNameRecordType: 'mixed', 'dataFilename': 'pcTaxo_labeled.150107b.tsv'},
])
df_lbldDataInfo.set_index('colId', inplace=True)
filesToProcess = pd.Series([7]) ;
if debug:
print df_lbldDataInfo
for fileId in filesToProcess:
record = df_lbldDataInfo.loc[fileId]; print record
testLabeledData = [
{'col1':'textItem', 'col2':"BLACK DENON DL-77", 'col3':"CE>HW", 'col4':'TEST'},
{'col1':'textItem', 'col2':"vw beetle x76 cab", 'col3':"AU>AU", 'col4':'TEST'},
{'col1':'taxoPath', 'col2':"Computing > Computing Hardware & Accessories", 'col3':"CT>HW", 'col4':'TEST'},
{'col1':'taxoPath', 'col2':"Appliances > Air Cleaners, Heating & Cooling > Household Fans", 'col3':"HG>AP", 'col4':'TEST'},
{'col1':'taxoPath', 'col2':"No separator", 'col3':"all", 'col4':'TEST'},
]
testDictionaryData = [
{'term':"denon", 'category':"CE", 'type':"BN"},
{'term':"dl-77", 'category':"CE", 'type':"P"},
{'term':"computing hardware", 'category':"CT>HW", 'type':"PC"},
{'term':"appliance", 'category':"PC", 'type':"HG>AP"},
{'term':"red", 'category':"PF", 'type':"OT"},
# {'term':"", 'category':"", 'type':""},
]
# one day we'll need the canonical taxonoymy
# srs_masterTaxoDataInfo = Series(["VIG","prodtaxo_lbld_VIG.141030.tsv.gz"], index=taxoDataCols)
Read-in the data¶
ASSUMPTIONS
- labeled data files are tab separated, and have column headers: taxoPath and taxoLabel
TO DO
- Error check: file does not exist or contain only a few lines
- Error check: file does not contain a (correct) header
Read-in the labeled data¶
In [4]:
df_labeledData = DataFrame()
# iterate through each file and append contents to df_labeledTaxonomies
# cols_to_keep = ['taxoPath','taxoLabel']
cols_to_keep = ['recordTextContent','recordLabel']
def getLabeledData(dataFilename, dataSource, debug):
# assumes: columns not empty
# assumes: column headers are present
df_tmp = pd.read_csv(dataDir + dataFilename, delimiter='\t', skipinitialspace=True)[cols_to_keep]
df_tmp.columns = [colNameRecordTextContent,colNameRecordLabel]
df_tmp[colNameRecordSource]=dataSource
if debug>=3: print "--------------------\dataSource =", dataSource, " rows =", len(df_tmp), " sample\n", df_tmp.loc[random.choice(df_tmp.index, 3, replace=False)]
return(df_tmp)
# iterate through the several data files
for fileId in filesToProcess:
record = df_lbldDataInfo.loc[fileId]
dataSource = record[colNameRecordSource]
recordType = record[colNameRecordType]
dataFilename = record['dataFilename']
df_labeledData = pd.concat ([df_labeledData, getLabeledData(dataFilename, dataSource, debug)])
#df_tmp = pd.concat ([df_labeledData, getLabeledData(dataFilename, dataSource, debug)]); df_labeledData = df_tmp; del df_tmp
# randomize it (and reindex it)
start_index_key = 0
df_labeledData.index = random.permutation(range(start_index_key, len(df_labeledData) + start_index_key))
df_labeledData.sort_index(inplace=True)
# TO DEBUG HARDCODE SOME LABELED RECORDS
if debug>=2:
df_labeledData = pd.DataFrame(testLabeledData)
df_labeledData.columns = [colNameRecordType, colNameRecordTextContent, colNameRecordLabel, colNameRecordSource]
if debug:
print "Labeled data (raw):", df_labeledData.shape #e.g. (33886, 3)
print "\nSample data:"; print df_labeledData.loc[random.choice(df_labeledData.index, 5, replace=False)]
df_labeledData.to_csv("df_labeledData." + dstamp + ".csv")
df_masterData = df_labeledData[(df_labeledData.recordSource == 'VIG')]
# DEBUG: report the distribution by dataSource
print "\nDistribution by dataSource:";
print df_labeledData.groupby([colNameRecordSource]).count().sort(colNameRecordLabel, ascending=False)
print "\nDistribution by category:";
print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False)
Read-in the dictionary¶
In [5]:
cols_to_keep = ['term','category','type']
df_dictionary=pd.read_csv(dataDir + dictionaryFile, delimiter='\t', skipinitialspace=True)[cols_to_keep]
if debug>=2:
df_dictionary = pd.DataFrame(testDictionaryData)
df_dictionary.columns = cols_to_keep
if debug: print "Sample of dictionary records" ; print df_dictionary.loc[random.choice(df_dictionary.index, 5, replace=False)]
Clean the data¶
In [6]:
# REMOVE LABELED RECORDS WITH INFREQUENT LABELS (i.e. to only train a model on labels with some supproting evidence)
g = df_labeledData.groupby([colNameRecordLabel])
df_labeledData = g.filter(lambda x: x[colNameRecordTextContent].count() >= minRowsPerLabel)
df_labeledData[colNameRecordTextContent] = df_labeledData[colNameRecordTextContent].str.lower()
df_labeledData.index = range(0, len(df_labeledData))
if len(df_labeledData.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!"
elif debug:
print "Labeled data (filtered):", df_labeledData.shape #e.g. (33834, 3)
print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False).head(5)
print "..."
if debug==1: print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False).tail(5)
df_labeledData.to_csv("df_labeledData_filtered." + dstamp + ".csv")
# report on a single record that we can follow through
elif debug>=3:
#stringToMatch="irconia"
#testRecordId=df_labeledTaxonomies.loc[df_labeledData['taxoPath'].str.contains(stringToMatch)].index[0]
testRecordId=random.choice(df_labeledData.index, 1, replace=False)[0]
print "testRecordId=", testRecordId
print df_labeledData.loc[testRecordId]
In [7]:
# IN THE DICTIONARY IDENTIFY THE POPULAR category/type PAIRS
df=DataFrame({'termCount' : df_dictionary.groupby(["category","type"]).size()}).reset_index()
df_popularCategoryTypes = df[(df.termCount >= minDictionaryTermsFilter)].reset_index()
df_popularCategoryTypes = df_popularCategoryTypes.sort(['category','type'])
del df # todo: avoid the temp data structure
df_popularCategoryTypes.to_csv(dictPopularPairsFile, sep='\t')
if debug>=1:
print "number of popular pairs:", len(df_popularCategoryTypes.index) # e.g. 13
#for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
for index, row in df_popularCategoryTypes.iterrows():
print "index:", index, " category:", row['category'], " type:", row['type'], " termCount:", row['termCount']
FEATURE EXTRACTION¶
Create the feature vector¶
In [8]:
# shell array with x records
# (a zero-column frame with one row per labeled record; feature columns are
# appended by the cells below, keeping row order aligned with df_labeledData)
df_extrContentFeatures = pd.DataFrame(np.empty((len(df_labeledData.index),0)))
Create the dictionary-based featurizer (a list of them for popular pairs)¶
In [9]:
list_cntVectorizer = []
# CREATE A LIST OF TRAINED CounterVectorizers ITERATIVELY
#for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
# productType = row['type']
# productCategory = row['category']
for index, record in df_popularCategoryTypes.iterrows():
productType, productCategory = record[colNameTermType], record[colNameTermCategory]
df_TypeCategory_terms = pd.DataFrame(df_dictionary[(df_dictionary.category==productCategory) & (df_dictionary.type==productType)]['term'].str.lower())
if debug>=1:
print "index:", index, " productType:", record['type'], " productCategory:", record['category']
print df_TypeCategory_terms.shape
if debug>=3: # report the counts
print df_TypeCategory_terms.tail(3)
print df_TypeCategory_terms[df_TypeCategory_terms['term'].str.contains("zener")]
srs_termCount = df_TypeCategory_terms.term.apply(lambda x: pd.value_counts(x.lower().split(' '))).sum(axis = 0)
print srs_termCount.head(5)
print "\n"
cntVectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=1) #, decode_error="ignore")
cntVectorizer.fit(df_TypeCategory_terms['term'])
list_cntVectorizer.append(cntVectorizer)
# WRITE-OUT THE DICTIONARY-BASED FEATURIZATION MODEL(s)
joblib.dump(list_cntVectorizer, dictBasedUnigramVectorizerListFile)
if debug>=3: list_cntVectorizer
Features based on content string and dictionary¶
In [10]:
# simple length feature: number of characters in the content string
df_extrContentFeatures['strLen'] = df_labeledData[colNameRecordTextContent].str.len()
# Extract derived information
df_labeledDataDerived = pd.DataFrame(np.empty((len(df_labeledData.index),0))) # shell array
# taxonomy-path segments, split on the ' > ' separator
df_labeledDataDerived['pathNodes'] = df_labeledData.recordTextContent.apply(lambda s: s.split(' > '))
# tokens: the character class [>| |&]+ splits on runs of '>', '|', space, '&'
# NOTE(review): as a character class the '|' chars are literals, not
# alternation -- presumably intended, but confirm.
df_labeledDataDerived[colNameRecordTextContentTokens] = df_labeledData.recordTextContent.apply(lambda s: split('[>| |&]+',s))
if debug>=2: print "derived data:", df_labeledDataDerived.loc[random.choice(df_labeledDataDerived.index, 5, replace=False)]
# token-count feature (also the denominator for terms2tokens ratios below)
df_extrContentFeatures[colNameRecordTextContentTokensCount] = df_labeledDataDerived[colNameRecordTextContentTokens].str.len()
# error check
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 2, replace=True)]
In [11]:
# ITERATIVELY TEST EACH TERM AGAINST THE TRAINED CountVectorizers
for index, record in df_popularCategoryTypes.iterrows(): # ordered as a debugging aid
productType, productCategory = record[colNameTermType], record[colNameTermCategory]
cntVectorizer = list_cntVectorizer[index]
cv_list_pre = cntVectorizer.transform(df_labeledData[colNameRecordTextContent])
cv_list = cv_list_pre.toarray().sum(axis=1).tolist()
df_extrContentFeatures[productType+"_"+productCategory+"_terms"] = cv_list
df_extrContentFeatures[productType+"_"+productCategory+"_terms2tokens"] = cv_list / df_extrContentFeatures[colNameRecordTextContentTokensCount]
if debug>=1:
print "index:", index, " productType:", record['type'], " productCategory:", record['category']
if debug>=3:
print cntVectorizer.transform(df_testTextItem['textItem']).toarray()
print cv_list
print "\n"
if debug: df_extrContentFeatures.to_csv("df_extrContentFeatures." + dstamp + ".csv")
# confirm that the [[vector]]s have not been jumbled on the record selected earlier/above
#print df_indepFeatures.loc[testRecordId]
# report a sample
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 2, replace=True)]
In [12]:
# NOTE(review): this cell looks like a stale duplicate of the previous one --
# it recomputes cv_sumList and never stores it (all assignments are commented
# out), so it only produces debug output. Candidate for deletion.
for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
    productType = row['type']
    productCategory = row['category']
    cntVectorizer = list_cntVectorizer[index]
    #cv_sumList = cntVectorizer.transform(df_prepData['pathTokens']).toarray().sum(axis=1).tolist()
    #df_extrContentFeatures [productType+"_"+productCategory+"_terms"] = cv_sumList
    #df_extrContentFeatures [productType+"_"+productCategory+"_terms2tokens"] = cv_sumList / df_testTextItems ['tokens']
    cv_sumList = cntVectorizer.transform(df_labeledData[colNameRecordTextContent]).toarray().sum(axis=1).tolist()
    #cv_sumList = cntVectorizer.transform(df_testTextItems ['textItem']).toarray().sum(axis=1).tolist()
    #df_testTextItems [productType+"_"+productCategory+"_terms"] = cv_sumList
    #df_testTextItems [productType+"_"+productCategory+"_terms2tokens"] = cv_sumList / df_testTextItems ['tokens']
    if debug>=2:
        print "index:", index, " productType:", row['type'], " productCategory:", row['category']
    if debug>=3:
        # NOTE(review): df_testTextItem is not defined anywhere in this
        # notebook; this branch raises NameError at debug>=3 -- confirm intent.
        print cntVectorizer.transform(df_testTextItem['textItem']).toarray()
        print cv_sumList
        print "\n"
Features based on corpus text token matches¶
Notes:
- For now just unigrams.
In [13]:
# Fit a corpus-wide unigram vectorizer over all labeled content strings.
srs_textCorpus = df_labeledData[colNameRecordTextContent]
# CountVectorizer.fit returns the fitted vectorizer itself, so chain it.
corpusBasedUnigramVectorizer = CountVectorizer(min_df=1, decode_error="ignore").fit(srs_textCorpus)
# persist this feature extractor alongside the model
joblib.dump(corpusBasedUnigramVectorizer, corpusBasedUnigramVectorizerFile)
Out[13]:
In [14]:
# apply this feature extractor
sprs_vectorizedTokens = corpusBasedUnigramVectorizer.transform(srs_textCorpus)
# densify for the frame merge below -- memory-heavy for large corpora
narr_vectorizedTokens = sprs_vectorizedTokens.toarray()
df_vectorizedTokens = pd.DataFrame(narr_vectorizedTokens)
if debug: print "DEBUG: n_samples: %d, n_features: %d\n" % sprs_vectorizedTokens.shape # n_samples: 18257, n_features: 8284
In [15]:
# for debugging keep the feature names
tokenDict = corpusBasedUnigramVectorizer.get_feature_names()
label_prefix="ut"
unigramTokenFeatureNames=[label_prefix + "_" + str(i) for i in range(df_vectorizedTokens.shape[1])]
df_vectorizedTokens.columns = unigramTokenFeatureNames
if debug>=1:
# report a sample
print df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 3, replace=False)]
# manually review the test record
#df_vectorizedTokens.loc[testRecordId]
if debug>=2:
print tokenDict # [u'00', u'08', u'09', u'10', u'100', u'1000', u'1001', ..., u'0042g3c1a41e', ..., u'146qqsspagenamezwdvwqqrdz1qqcmdzviewitem',
In [16]:
# DEBUG: relabel the column names to be user friendly
if debug>=2:
#unigramTokenFeatureNames=[label_prefix+"_"+str(i) for i in range(df_vectorizedTokens.shape[1])]
#unigramTokenFeatureNames=[label_prefix + "_" + tokenDict[i].decode('utf-8') for i in range(df_vectorizedTokens.shape[1])]
unigramTokenFeatureNames=[label_prefix + "_" + tokenDict[i] for i in range(df_vectorizedTokens.shape[1])]
df_vectorizedTokens.columns = unigramTokenFeatureNames
# report a sample
df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 3, replace=False)]
Merge all the separate feature spaces¶
In [17]:
# fyi, using .join instead appears to be more memory intensive than using merge()
# inner-join the content features with the corpus unigram features on the
# shared 0..n-1 row index (both are aligned with df_labeledData)
df_extrFeatures = pd.merge(df_extrContentFeatures, df_vectorizedTokens, how='inner', left_index=True, right_index=True, sort=True,
 suffixes=('_x', '_y'), copy=True)
#df_extrFeatures = df_extrContentFeatures
#df_extrFeatures = df_vectorizedTokens
# clean-up
if not debug: del df_extrContentFeatures, df_vectorizedTokens
In [25]:
# DEBUG
if debug>=1:
print "shape: ", df_extrFeatures.shape # (16000, 7706) # (35163, 13503)
# report a sample
print df_extrFeatures.loc[random.choice(df_extrFeatures.index, 3, replace=False)]
# write-out to a file
if debug>=3:
#df_extrFeatures.to_csv("df_extrFeatures." + dstamp + ".tsv", sep='\t', encoding='utf-8')
df_extrFeatures.to_csv("df_extrFeatures." + dstamp + ".csv", encoding='latin1')
Prepare the data to meet the scikit-learn's requirements¶
mainly create separate structures for the target data and the feature data¶
Prepare the target variable¶
In [26]:
# target vector: one label string per labeled record
ndarr_labelData=df_labeledData[colNameRecordLabel] # array(['AE>VI', 'AE>MU', 'AE>AR', ...
# NOTE(review): `debug>=0` is always true for the numeric debug levels used in
# this notebook, so this branch always runs (ndarr_classes is required below).
if debug>=0:
    ndarr_classes = pd.unique(df_labeledData[colNameRecordLabel]) # array(['AE>VI', 'AE>MU', 'AE>AR', ...
    print ndarr_classes
# encoder mapping label strings <-> integer class ids (accuracy_score needs ints)
targetEncoder = preprocessing.LabelEncoder()
targetEncoder.fit(ndarr_classes)
Out[26]:
prepare the source specific versions (for testing)
In [27]:
if debug>=4:
ndarr_labelData_EB=df_extrFeatures[(df_labeledData.taxoCode=='EB')][colNameRecordLabel] # array(['AE>VI', 'AE>MU', 'AE>AR', ...
#ndarr_labelData_EB
print len(ndarr_labelData_EB) # 21688
ndarr_labelData_SHZ=df_extrFeatures[(df_labeledData.taxoCode=='SHZ')][colNameRecordLabel] # array(['AE>VI', 'AE>MU', 'AE>AR', ...
#ndarr_labelData_SHZ
print len(ndarr_labelData_SHZ) # 719
Separate the feature data¶
In [28]:
# columns that would leak label/bookkeeping information into the features
cols2excludeMc = ['catid','taxoLabel','pathTokens','taxoCode','taxoPath','pathNodes']
# keep every extracted-feature column that is not on the exclusion list
exclusionSet = set(cols2excludeMc)
cols2retainMc = [col for col in df_extrFeatures.columns if col not in exclusionSet]
# the design matrix handed to the classifier below
df_featuresTable = df_extrFeatures[cols2retainMc]
prepare the source specific versions (for testing)
Train Model(s)¶
In [22]:
# train one flat multi-class linear SVM over all labels
#svm_clfr_all = svm.SVC(gamma=0.001, C=100., probability=True)
#svm_clfr_all = svm.SVC(kernel='linear', gamma=0.001, C=100.)
# loss='l2' is the sklearn 0.15 spelling (renamed 'squared_hinge' in later releases)
svm_clfr_all = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 7.768s
t0 = time()
svm_clfr_all.fit(df_featuresTable, ndarr_labelData)
train_time = time() - t0
print("train time: %0.3fs" % train_time) # train time: 71s
Write Out Model¶
In [23]:
# write out the trained model
# NOTE(review): modelFilename/modelFile repeat the exact expressions from the
# GLOBALS cell -- redundant but harmless (same dstamp within a run).
modelFilename = "svm_clfr_all." + dstamp + ".sklearn"
modelFile = modelsDir + modelFilename
joblib.dump(svm_clfr_all, modelFile)
Out[23]:
models for testing¶
In [29]:
# train per-source models for the cross-source generalization tests
# NOTE(review): df_featuresTable_SHZ/_EB and ndarr_labelData_SHZ/_EB come from
# other debug>=4 cells that filter on a stale 'taxoCode' column -- confirm
# those cells before relying on this path.
if debug>=4:
    #svm_clfr_SHZ = svm.SVC(gamma=0.001, C=100., probability=True)
    svm_clfr_SHZ = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 1.056s
    t0 = time()
    svm_clfr_SHZ.fit(df_featuresTable_SHZ, ndarr_labelData_SHZ)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    #svm_clfr_EB = svm.SVC(gamma=0.001, C=100., probability=True)
    svm_clfr_EB = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 5.330s
    t0 = time()
    svm_clfr_EB.fit(df_featuresTable_EB, ndarr_labelData_EB)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
Test Model(s)¶
TO DO
- stratified tests (e.g. keep one source out, keep one source+category out)
In [30]:
# in-sample sanity check: score the model on its own training data
ndarr_preds_All_All = svm_clfr_all.predict(df_featuresTable)
# note, accuracy_score requires numerized target labels
print "accuracy_score=", accuracy_score(targetEncoder.transform(ndarr_labelData), targetEncoder.transform(ndarr_preds_All_All))
# in sample 0.7109 # 0.9201 # 0.9261 (linear) # 0.997
Simple 'stratification'¶
cross-validation¶
unfair test - information leakage
In [31]:
# ideally should reuse exact settings from above
clf = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 7.768s
t0 = time()
# 5-fold CV on a fresh (untrained) clone of the classifier; note the feature
# extractors were fit on the full corpus, so these scores are optimistic
# (information leakage, acknowledged in the section header above)
scores = cross_validation.cross_val_score(clf, df_featuresTable, targetEncoder.transform(ndarr_labelData), cv=5)
eval_time = time() - t0
print("eval time: %0.3fs" % eval_time) # eval time:
print "scores =" ,scores
print "mean =", scores.mean(), " std =", scores.std()
# scores = [ 0.83715287 0.95904996 0.97941822 0.96415771 0.89734366] # old # 5CVmean = 0.92742448436 std = 0.053116228864
# scores = [ 0.97379232 0.97584884 0.97218656 0.97384967 0.97607107] # 141118 all features
# scores = [ 0.77192237 0.76956954 0.77421195 0.76821949 0.7717438] # 141118 df_extrContentFeatures
# scores = [ 0.97039241 0.9731496 0.97133077 0.9718491 0.97277547] # mean = 0.971899467383 std = 0.000992712274548
In [26]:
In [26]:
In [26]:
In [26]:
In [ ]: