Gabor Melli's v2nodeClassify.TrainEval.150107

From GM-RKB

Gabor_Melli's_v2nodeClassify.TrainEval.150107 is a supervised product taxonomy node prediction system.



References

2015

v2nodeClassify.TrainEval.150107

TRAIN (AND EVALUATE) A TAXO-NODE CLASSIFICATION MODEL

This notebook implements a supervised product taxonomy classification system based on a one-vs-rest supervised classification algorithm.
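
For readers unfamiliar with the one-vs-rest scheme: one binary classifier is trained per taxonomy label and the label with the strongest score wins (the LinearSVC trained further below applies this scheme internally). A minimal, purely illustrative sketch on toy data, not part of the original notebook:

# Illustrative only: an explicit one-vs-rest wrapper around a linear SVM on toy data.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

toy_texts  = ["computing hardware accessories", "household fans", "brake pads"]
toy_labels = ["CT>HW", "HG>AP", "AU>PA"]

vec = CountVectorizer(min_df=1)
X = vec.fit_transform(toy_texts)

ovr = OneVsRestClassifier(LinearSVC())
ovr.fit(X, toy_labels)
print ovr.predict(vec.transform(["household fans for sale"]))  # should favour 'HG>AP'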

To Do

  • Add text preprocessing
  • Make featurization a subroutine
  • Include a feature for the type of record (taxonomy or document)
In [1]:
# LIBRARIES
debug = 1

import pandas as pd
if debug: print "pandas version: " + pd.__version__   # pandas version: 0.14.1
from pandas import DataFrame, Series

import numpy as np
if debug: print "numpy version: " + np.__version__    # numpy version: 1.8.1
from numpy  import random # random

from re     import split

import sklearn as skl
from sklearn import preprocessing, svm, cross_validation  # labelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.feature_extraction.text 
from sklearn.externals import joblib
if debug: print "sklearn version: " + sklearn.__version__    # sklearn version: 0.15.0

import gc
pandas version: 0.14.1
numpy version: 1.8.1
sklearn version: 0.15.0
In [2]:
# GLOBALS

from datetime import datetime
from time import time
dstamp=datetime.now().strftime("%y%m%d")
if debug: print "dstamp=" + dstamp # dstamp=141113
#dstamp="141106"
tstamp=datetime.now().strftime("%y%m%d%H%M%S")
if debug: print "tstamp=" + tstamp # tstamp=141113032844

dataDir = "../data/"
modelsDir = "models/"

dictionaryFile="PC_BN_PF_terms.141107b.tsv"
#dictionaryFile="PC_BN_PF_BPC_terms.141208.tsv"

corpusBasedUnigramVectorizerFilename = "corpusBasedUnigramVectorizer." + dstamp + ".sklearn"
corpusBasedUnigramVectorizerFile = modelsDir + corpusBasedUnigramVectorizerFilename

dictPopularPairsFilename = "dictPopularPairs." + dstamp + ".tsv"
dictPopularPairsFile = modelsDir + dictPopularPairsFilename

dictBasedUnigramVectorizerListFilename = "dictBasedUnigramVectorizerList." + dstamp + ".sklearn"
dictBasedUnigramVectorizerListFile = modelsDir + dictBasedUnigramVectorizerListFilename

modelFilename="svm_clfr_all." + dstamp + ".sklearn"
modelFile=modelsDir + modelFilename

colNameRecordType        = 'recordType'
colNameRecordTextContent = 'recordTextContent'
colNameRecordLabel       = 'recordLabel'
colNameRecordSource      = 'recordSource'
colNameTermType          = 'type'
colNameTermCategory                 = 'category'
colNameRecordTextContentTokens      = 'tokens'
colNameRecordTextContentTokensCount = 'tokensCount'
dstamp=150107
tstamp=150107180358
In [3]:
#TRAIN CONFIGURATION

# min number of dict terms required for a feature-pair (e.g. FS/PC)
minDictionaryTermsFilter=3000
if debug>=2: minDictionaryTermsFilter=1

# min number of samples per label
minRowsPerLabel = 4
if debug>=2: minRowsPerLabel = 1

taxoDataCols=['taxoPath','dataFilename']

df_lbldDataInfo = pd.DataFrame([
   { 'colId':0, colNameRecordSource : 'VIG',       colNameRecordType: 'taxoPath',  'dataFilename': 'pcTaxo_lbld2_VIG.141114.tsv'},
   { 'colId':1, colNameRecordSource : 'EB',        colNameRecordType: 'taxoPath',  'dataFilename': 'pcTaxo_lbld2_EB.141030b.tsv'},
   { 'colId':2, colNameRecordSource : 'PG',        colNameRecordType: 'taxoPath',  'dataFilename': 'pcTaxo_lbld2_PG.141030b.tsv'},
   { 'colId':3, colNameRecordSource : 'SHZ',       colNameRecordType: 'taxoPath',  'dataFilename': 'pcTaxo_lbld2_SHZ.141030b.tsv'},
   { 'colId':4, colNameRecordSource : 'CJ',        colNameRecordType: 'taxoPath',  'dataFilename': 'pcTaxo_lbld2_CJ.141114.tsv'},
   { 'colId':5, colNameRecordSource : 'CPROD_sml', colNameRecordType: 'passage',   'dataFilename': 'text_lbld2_CPROD1_0-40.141030.tsv'},
   { 'colId':6, colNameRecordSource : 'CPROD_med', colNameRecordType: 'passage',   'dataFilename': 'text_lbld2_CPROD1_170-230.141030.tsv'},
   { 'colId':7, colNameRecordSource : 'all',       colNameRecordType: 'mixed',     'dataFilename': 'pcTaxo_labeled.150107b.tsv'},
])
df_lbldDataInfo.set_index('colId', inplace=True)

filesToProcess = pd.Series([7]) ;

if debug:
    print df_lbldDataInfo
    for fileId in filesToProcess:
        record = df_lbldDataInfo.loc[fileId]; print record

testLabeledData = [
      {'col1':'textItem', 'col2':"BLACK DENON DL-77", 'col3':"CE>HW", 'col4':'TEST'},
      {'col1':'textItem', 'col2':"vw beetle x76 cab", 'col3':"AU>AU", 'col4':'TEST'},
      {'col1':'taxoPath', 'col2':"Computing > Computing Hardware & Accessories", 'col3':"CT>HW", 'col4':'TEST'},
      {'col1':'taxoPath', 'col2':"Appliances > Air Cleaners, Heating & Cooling > Household Fans", 'col3':"HG>AP", 'col4':'TEST'},
      {'col1':'taxoPath', 'col2':"No separator", 'col3':"all", 'col4':'TEST'},
   ]

testDictionaryData = [
      {'term':"denon", 'category':"CE", 'type':"BN"},
      {'term':"dl-77", 'category':"CE", 'type':"P"},
      {'term':"computing hardware", 'category':"CT>HW", 'type':"PC"},
      {'term':"appliance", 'category':"PC", 'type':"HG>AP"},
      {'term':"red", 'category':"PF", 'type':"OT"},
      # {'term':"", 'category':"", 'type':""},
   ]

# one day we'll need the canonical taxonomy
# srs_masterTaxoDataInfo = Series(["VIG","prodtaxo_lbld_VIG.141030.tsv.gz"], index=taxoDataCols)
                               dataFilename recordSource recordType
colId      
0               pcTaxo_lbld2_VIG.141114.tsv          VIG   taxoPath
1               pcTaxo_lbld2_EB.141030b.tsv           EB   taxoPath
2               pcTaxo_lbld2_PG.141030b.tsv           PG   taxoPath
3              pcTaxo_lbld2_SHZ.141030b.tsv          SHZ   taxoPath
4                pcTaxo_lbld2_CJ.141114.tsv           CJ   taxoPath
5         text_lbld2_CPROD1_0-40.141030.tsv    CPROD_sml    passage
6      text_lbld2_CPROD1_170-230.141030.tsv    CPROD_med    passage
7                pcTaxo_labeled.150107b.tsv          all      mixed
dataFilename    pcTaxo_labeled.150107b.tsv
recordSource                           all
recordType                           mixed
Name: 7, dtype: object

Read-in the data

ASSUMPTIONS

  • labeled data files are tab-separated and have the column headers recordTextContent and recordLabel

TO DO

  • Error check: file does not exist or contains only a few lines
  • Error check: file does not contain a (correct) header (a sketch of these checks follows below)
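
A minimal sketch of these checks (checkLabeledDataFile is a hypothetical helper, not part of the original notebook; it assumes the tab-separated format and the recordTextContent/recordLabel headers listed under ASSUMPTIONS):

import os
import pandas as pd

def checkLabeledDataFile(path, requiredCols=('recordTextContent', 'recordLabel'), minRows=10):
    # error check: file does not exist
    if not os.path.isfile(path):
        raise IOError("labeled data file not found: " + path)
    df = pd.read_csv(path, delimiter='\t', skipinitialspace=True)
    # error check: file does not contain a (correct) header
    missing = [c for c in requiredCols if c not in df.columns]
    if missing:
        raise ValueError("missing expected columns " + str(missing) + " in " + path)
    # error check: file contains only a few lines
    if len(df) < minRows:
        raise ValueError("only " + str(len(df)) + " rows in " + path)
    return df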

Read-in the labeled data

In [4]:
df_labeledData = DataFrame()

# iterate through each file and append its contents to df_labeledData

# cols_to_keep = ['taxoPath','taxoLabel']
cols_to_keep = ['recordTextContent','recordLabel']

def getLabeledData(dataFilename, dataSource, debug):
    # assumes: columns not empty
    # assumes: column headers are present
    df_tmp = pd.read_csv(dataDir + dataFilename, delimiter='\t', skipinitialspace=True)[cols_to_keep]
    df_tmp.columns = [colNameRecordTextContent,colNameRecordLabel]
    df_tmp[colNameRecordSource]=dataSource 
    if debug>=3: print "--------------------\ndataSource =", dataSource, " rows =", len(df_tmp), " sample\n", df_tmp.loc[random.choice(df_tmp.index, 3, replace=False)]
    return(df_tmp)

# iterate through the several data files
for fileId in filesToProcess: 
    record = df_lbldDataInfo.loc[fileId]
    dataSource = record[colNameRecordSource]
    recordType = record[colNameRecordType]
    dataFilename = record['dataFilename']
    df_labeledData = pd.concat ([df_labeledData, getLabeledData(dataFilename, dataSource, debug)])
    #df_tmp = pd.concat ([df_labeledData, getLabeledData(dataFilename, dataSource, debug)]); df_labeledData = df_tmp; del df_tmp

# randomize it (and reindex it)
start_index_key = 0
df_labeledData.index = random.permutation(range(start_index_key, len(df_labeledData) + start_index_key))
df_labeledData.sort_index(inplace=True)

# TO DEBUG HARDCODE SOME LABELED RECORDS
if debug>=2:
   df_labeledData = pd.DataFrame(testLabeledData)
   df_labeledData.columns = [colNameRecordType, colNameRecordTextContent, colNameRecordLabel, colNameRecordSource]

if debug:
    print "Labeled data (raw):", df_labeledData.shape #e.g. (33886, 3)
    print "\nSample data:"; print df_labeledData.loc[random.choice(df_labeledData.index, 5, replace=False)]
    df_labeledData.to_csv("df_labeledData." + dstamp + ".csv")
    df_masterData = df_labeledData[(df_labeledData.recordSource == 'VIG')]

    # DEBUG: report the distribution by dataSource
    print "\nDistribution by dataSource:"; 
    print df_labeledData.groupby([colNameRecordSource]).count().sort(colNameRecordLabel, ascending=False)
    print "\nDistribution by category:"; 
    print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False)
Labeled data (raw): (56417, 3)

Sample data:
                                       recordTextContent recordLabel  \
33915  Vehicles > CHEVROLET, Vehicles > OLDSMOBILE, V...       AU>PA 
4765   Vehicles > MERCURY, Parts > Categories > Brake...       AU>PA 
29832              Sporting Goods > Indoor Games > Darts       SF>OT 
55592  Clothing, Shoes & Accessories > Unisex Clothin...       FS>AC 
13556                                                NaN         NaN 

      recordSource
33915          all
4765           all
29832          all
55592          all
13556          all

Distribution by dataSource:
              recordTextContent  recordLabel
recordSource
all                       49951        49951

Distribution by category:
             recordTextContent  recordSource
recordLabel 
AU>PA                    13547         13547
CB>OT                     5627          5627
AE>VI                     2513          2513
HO>AP                     2143          2143
IS>IN                     1553          1553
BK>BK                     1444          1444
HO>TO                     1403          1403
SF>OT                     1348          1348
HG>HD                     1255          1255
FS>CL                      805           805
OT                         793           793
CT>HW                      777           777
FS>CL>WO                   766           766
HG>KD                      676           676
HO>CR                      634           634
JW>JE                      606           606
CE>HE                      541           541
HG>HI                      520           520
CE                         515           515
HG>GI                      484           484
CB>ST                      465           465
FH>FA                      456           456
FD>FD                      455           455
A                          378           378
HB>BE                      361           361
AU                         347           347
SF>BI                      338           338
SF>OS                      336           336
HG>GA                      322           322
CP>CA                      316           316
...                        …           ...
CP                           2             2
HB>OT                        2             2
AG>GA                        1             1
RT                           1             1
CT>HT                        1             1
CM>SW                        1             1
DT                           1             1
EM                           1             1
FB                           1             1
FD                           1             1
FH                           1             1
SF>CS                        1             1
HG>TO                        1             1
BL                           1             1
HG>GN                        1             1
SD>WI                        1             1
HG>GD                        1             1
CB>MO                        1             1
JW                           1             1
FS>AC>OT                     1             1
AG>OT                        1             1
CB                           1             1
KD>KN                        1             1
FS>LB>OT                     1             1
HG                           1             1
HO                           1             1
MM                           1             1
MP>AT                        1             1
MP                           1             1
FS>SH>OT                     1             1

[158 rows x 2 columns]

Read-in the dictionary

In [5]:
cols_to_keep = ['term','category','type']
df_dictionary=pd.read_csv(dataDir + dictionaryFile, delimiter='\t', skipinitialspace=True)[cols_to_keep]

if debug>=2:
   df_dictionary = pd.DataFrame(testDictionaryData)
   df_dictionary.columns = cols_to_keep

if debug: print "Sample of dictionary records" ; print df_dictionary.loc[random.choice(df_dictionary.index, 5, replace=False)]
Sample of dictionary records
                    term category type
105163      rubber bands       HO   PC
122661    high jump bars       SF   PC
85651             yellow       GM   PF
33652        nissan 1200       AU   PL
74057   kids dog costume       FS   PC

Clean the data

In [6]:
# REMOVE LABELED RECORDS WITH INFREQUENT LABELS (i.e. to only train a model on labels with some supporting evidence)

g = df_labeledData.groupby([colNameRecordLabel])
df_labeledData = g.filter(lambda x: x[colNameRecordTextContent].count() >= minRowsPerLabel)

df_labeledData[colNameRecordTextContent] = df_labeledData[colNameRecordTextContent].str.lower()

df_labeledData.index = range(0, len(df_labeledData))

if len(df_labeledData.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!"

elif debug:
    print "Labeled data (filtered):", df_labeledData.shape #e.g. (33834, 3)  
    print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False).head(5)
    print "..."
    if debug==1: print df_labeledData.groupby([colNameRecordLabel]).count().sort(colNameRecordSource, ascending=False).tail(5)
    df_labeledData.to_csv("df_labeledData_filtered." + dstamp + ".csv")

# report on a single record that we can follow through
elif debug>=3:
    #stringToMatch="irconia"
    #testRecordId=df_labeledTaxonomies.loc[df_labeledData['taxoPath'].str.contains(stringToMatch)].index[0]
    testRecordId=random.choice(df_labeledData.index, 1, replace=False)[0]
    print "testRecordId=", testRecordId
    print df_labeledData.loc[testRecordId]
Labeled data (filtered): (49891, 3)
             recordTextContent  recordSource
recordLabel 
AU>PA                    13547         13547
CB>OT                     5627          5627
AE>VI                     2513          2513
HO>AP                     2143          2143
IS>IN                     1553          1553
...
             recordTextContent  recordSource
recordLabel 
BK>MN                        6             6
FS>OS                        6             6
SP                           6             6
FS                           5             5
IS>IS                        4             4
-c:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
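
The SettingWithCopyWarning above is raised because, at the point of the lower-casing assignment, df_labeledData is the frame returned by the groupby filter and pandas cannot tell whether it is a view or a copy. A minimal sketch of one way to silence it (assuming the same names as in the cell above) is to take an explicit copy before mutating:

# take an explicit copy of the filtered rows before assigning into them
df_labeledData = g.filter(lambda x: x[colNameRecordTextContent].count() >= minRowsPerLabel).copy()
df_labeledData[colNameRecordTextContent] = df_labeledData[colNameRecordTextContent].str.lower()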
In [7]:
# IN THE DICTIONARY IDENTIFY THE POPULAR category/type PAIRS

df=DataFrame({'termCount' : df_dictionary.groupby(["category","type"]).size()}).reset_index()
df_popularCategoryTypes = df[(df.termCount >= minDictionaryTermsFilter)].reset_index()
df_popularCategoryTypes = df_popularCategoryTypes.sort(['category','type'])
del df # todo: avoid the temp data structure

df_popularCategoryTypes.to_csv(dictPopularPairsFile, sep='\t') 

if debug>=1:
   print "number of popular pairs:", len(df_popularCategoryTypes.index) # e.g. 13
   #for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
   for index, row in df_popularCategoryTypes.iterrows():
      print "index:", index, "  category:", row['category'], "  type:", row['type'], "  termCount:", row['termCount']
number of popular pairs: 13
index: 0   category: A   type: BN   termCount: 6314
index: 1   category: AE   type: BN   termCount: 9654
index: 2   category: AE   type: PL   termCount: 6398
index: 3   category: AU   type: BN   termCount: 3763
index: 4   category: AU   type: PC   termCount: 7594
index: 5   category: CE   type: PC   termCount: 3404
index: 6   category: CE   type: PF   termCount: 3347
index: 7   category: FS   type: BN   termCount: 5017
index: 8   category: FS   type: PC   termCount: 11948
index: 9   category: HG   type: BN   termCount: 3791
index: 10   category: HG   type: PC   termCount: 5532
index: 11   category: HG   type: PF   termCount: 3086
index: 12   category: OT   type: PF   termCount: 8838


FEATURE EXTRACTION


Create the feature vector

In [8]:
# shell array with x records
df_extrContentFeatures = pd.DataFrame(np.empty((len(df_labeledData.index),0)))
In [9]:
list_cntVectorizer = []

# CREATE A LIST OF TRAINED CountVectorizers ITERATIVELY
#for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
#    productType     = row['type']
#    productCategory = row['category']

for index, record in df_popularCategoryTypes.iterrows():
    productType, productCategory = record[colNameTermType], record[colNameTermCategory]

    df_TypeCategory_terms = pd.DataFrame(df_dictionary[(df_dictionary.category==productCategory) & (df_dictionary.type==productType)]['term'].str.lower())

    if debug>=1:
       print "index:", index, "  productType:", record['type'], "  productCategory:", record['category']
       print df_TypeCategory_terms.shape

    if debug>=3:  # report the counts
       print df_TypeCategory_terms.tail(3)
       print df_TypeCategory_terms[df_TypeCategory_terms['term'].str.contains("zener")]
       srs_termCount = df_TypeCategory_terms.term.apply(lambda x: pd.value_counts(x.lower().split(' '))).sum(axis = 0)
       print srs_termCount.head(5)
       print "\n"

    cntVectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=1) #, decode_error="ignore")
    cntVectorizer.fit(df_TypeCategory_terms['term'])
    list_cntVectorizer.append(cntVectorizer)

# WRITE-OUT THE DICTIONARY-BASED FEATURIZATION MODEL(s)
joblib.dump(list_cntVectorizer, dictBasedUnigramVectorizerListFile) 

if debug>=3:  list_cntVectorizer
index: 0   productType: BN   productCategory: A
(6314, 1)
index: 1   productType: BN   productCategory: AE
(9654, 1)
index: 2   productType: PL   productCategory: AE
(6398, 1)
index: 3   productType: BN   productCategory: AU
(3763, 1)
index: 4   productType: PC   productCategory: AU
(7594, 1)
index: 5   productType: PC   productCategory: CE
(3404, 1)
index: 6   productType: PF   productCategory: CE
(3347, 1)
index: 7   productType: BN   productCategory: FS
(5017, 1)
index: 8   productType: PC   productCategory: FS
(11948, 1)
index: 9   productType: BN   productCategory: HG
(3791, 1)
index: 10   productType: PC   productCategory: HG
(5532, 1)
index: 11   productType: PF   productCategory: HG
(3086, 1)
index: 12   productType: PF   productCategory: OT
(8838, 1)

Features based on content string and dictionary

In [10]:
df_extrContentFeatures['strLen']    = df_labeledData[colNameRecordTextContent].str.len()

# Extract derived information 
df_labeledDataDerived = pd.DataFrame(np.empty((len(df_labeledData.index),0))) # shell array
df_labeledDataDerived['pathNodes']  = df_labeledData.recordTextContent.apply(lambda s: s.split(' > '))
df_labeledDataDerived[colNameRecordTextContentTokens] = df_labeledData.recordTextContent.apply(lambda s: split('[>| |&]+',s))
if debug>=2: print "derived data:", df_labeledDataDerived.loc[random.choice(df_labeledDataDerived.index, 5, replace=False)]

df_extrContentFeatures[colNameRecordTextContentTokensCount] = df_labeledDataDerived[colNameRecordTextContentTokens].str.len()

# error check
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 2, replace=True)]
       strLen  tokensCount
18015      48            6
5258       54            6
In [11]:
# ITERATIVELY TEST EACH TERM AGAINST THE TRAINED CountVectorizers

for index, record in df_popularCategoryTypes.iterrows():  # ordered as a debugging aid

    productType, productCategory = record[colNameTermType], record[colNameTermCategory]

    cntVectorizer = list_cntVectorizer[index] 
    cv_list_pre = cntVectorizer.transform(df_labeledData[colNameRecordTextContent])
    cv_list = cv_list_pre.toarray().sum(axis=1).tolist()

    df_extrContentFeatures[productType+"_"+productCategory+"_terms"] = cv_list
    df_extrContentFeatures[productType+"_"+productCategory+"_terms2tokens"] = cv_list / df_extrContentFeatures[colNameRecordTextContentTokensCount]

    if debug>=1:
       print "index:", index, "  productType:", record['type'], "  productCategory:", record['category']

    if debug>=3:
       print cntVectorizer.transform(df_testTextItem['textItem']).toarray()
       print cv_list
       print "\n"

 
if debug: df_extrContentFeatures.to_csv("df_extrContentFeatures." + dstamp + ".csv")
# confirm that the [[vector]]s have not been jumbled on the record selected earlier/above
#print df_indepFeatures.loc[testRecordId] 

# report a sample
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 2, replace=True)]
index: 0   productType: BN   productCategory: A
index: 1   productType: BN   productCategory: AE
index: 2   productType: PL   productCategory: AE
index: 3   productType: BN   productCategory: AU
index: 4   productType: PC   productCategory: AU
index: 5   productType: PC   productCategory: CE
index: 6   productType: PF   productCategory: CE
index: 7   productType: BN   productCategory: FS
index: 8   productType: PC   productCategory: FS
index: 9   productType: BN   productCategory: HG
index: 10   productType: PC   productCategory: HG
index: 11   productType: PF   productCategory: HG
index: 12   productType: PF   productCategory: OT
       strLen  tokensCount  BN_A_terms  BN_A_terms2tokens  BN_AE_terms  \
42052      87            8           4                0.5            1 
19851      30            4           0                0.0            0 

       BN_AE_terms2tokens  PL_AE_terms  PL_AE_terms2tokens  BN_AU_terms  \
42052               0.125            1               0.125            0 
19851               0.000            0               0.000            0 

       BN_AU_terms2tokens         …          PC_FS_terms  \
42052                   0         …                    3 
19851                   0         …                    0 

       PC_FS_terms2tokens  BN_HG_terms  BN_HG_terms2tokens  PC_HG_terms  \
42052               0.375            3               0.375            2 
19851               0.000            1               0.250            2 

       PC_HG_terms2tokens  PF_HG_terms  PF_HG_terms2tokens  PF_OT_terms  \
42052                0.25            3               0.375            1 
19851                0.50            1               0.250            2 

       PF_OT_terms2tokens
42052               0.125
19851               0.500

[2 rows x 28 columns]
In [12]:
for index, row in df_popularCategoryTypes.sort(['category','type']).iterrows():
    productType     = row['type']
    productCategory = row['category']

    cntVectorizer = list_cntVectorizer[index] 

    #cv_sumList = cntVectorizer.transform(df_prepData['pathTokens']).toarray().sum(axis=1).tolist()
    #df_extrContentFeatures [productType+"_"+productCategory+"_terms"] = cv_sumList
    #df_extrContentFeatures [productType+"_"+productCategory+"_terms2tokens"] = cv_sumList / df_testTextItems ['tokens']

    cv_sumList = cntVectorizer.transform(df_labeledData[colNameRecordTextContent]).toarray().sum(axis=1).tolist()
    #cv_sumList = cntVectorizer.transform(df_testTextItems ['textItem']).toarray().sum(axis=1).tolist()
    #df_testTextItems [productType+"_"+productCategory+"_terms"] = cv_sumList
    #df_testTextItems [productType+"_"+productCategory+"_terms2tokens"] = cv_sumList / df_testTextItems ['tokens']

    if debug>=2:
       print "index:", index, "  productType:", row['type'], "  productCategory:", row['category']

    if debug>=3:
       print cntVectorizer.transform(df_testTextItem['textItem']).toarray()
       print cv_sumList
       print "\n"
 

Features based on corpus text token matches

Notes:

  • For now just unigrams (a bigram variant is sketched below).
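
If bigrams are wanted later, the same vectorizer can produce them via its ngram_range parameter; a small sketch (corpusBasedBigramVectorizer is an illustrative name, not used elsewhere in this notebook):

# unigrams and bigrams instead of unigrams only; otherwise parallels the cell below
corpusBasedBigramVectorizer = CountVectorizer(min_df=1, decode_error="ignore", ngram_range=(1, 2))
corpusBasedBigramVectorizer.fit(df_labeledData[colNameRecordTextContent])
print len(corpusBasedBigramVectorizer.get_feature_names())  # vocabulary grows accordingly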
In [13]:
corpusBasedUnigramVectorizer = CountVectorizer(min_df=1, decode_error="ignore")

srs_textCorpus = df_labeledData[colNameRecordTextContent]

corpusBasedUnigramVectorizer.fit(srs_textCorpus)
# save this feature extractor
joblib.dump(corpusBasedUnigramVectorizer, corpusBasedUnigramVectorizerFile) 
Out[13]:
['models/corpusBasedUnigramVectorizer.150107.sklearn']
In [14]:
# apply this feature extractor
sprs_vectorizedTokens = corpusBasedUnigramVectorizer.transform(srs_textCorpus)

narr_vectorizedTokens = sprs_vectorizedTokens.toarray()
df_vectorizedTokens = pd.DataFrame(narr_vectorizedTokens)

if debug: print "DEBUG: n_samples: %d, n_features: %d\n" % sprs_vectorizedTokens.shape # n_samples: 18257, n_features: 8284
DEBUG: n_samples: 49891, n_features: 20238

In [15]:
# for debugging keep the feature names
tokenDict = corpusBasedUnigramVectorizer.get_feature_names()

label_prefix="ut"
unigramTokenFeatureNames=[label_prefix + "_" + str(i)  for i in range(df_vectorizedTokens.shape[1])]
df_vectorizedTokens.columns = unigramTokenFeatureNames

if debug>=1:
    # report a sample
    print df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 3, replace=False)]
    # manually review the test record
    #df_vectorizedTokens.loc[testRecordId] 

if debug>=2:
    print tokenDict # [u'00', u'08', u'09', u'10', u'100', u'1000', u'1001', ..., u'0042g3c1a41e', ..., u'146qqsspagenamezwdvwqqrdz1qqcmdzviewitem',
       ut_0  ut_1  ut_2  ut_3  ut_4  ut_5  ut_6  ut_7  ut_8  ut_9  …   \
43969     0     0     0     0     0     0     0     0     0     0  …
5782      0     0     0     0     0     0     0     0     0     0  …
8256      0     0     0     0     0     0     0     0     0     0  …

       ut_20228  ut_20229  ut_20230  ut_20231  ut_20232  ut_20233  ut_20234  \
43969         0         0         0         0         0         0         0 
5782          0         0         0         0         0         0         0 
8256          0         0         0         0         0         0         0 

       ut_20235  ut_20236  ut_20237
43969         0         0         0
5782          0         0         0
8256          0         0         0

[3 rows x 20238 columns]
In [16]:
# DEBUG: relabel the column names to be user friendly
if debug>=2:
   #unigramTokenFeatureNames=[label_prefix+"_"+str(i) for i in range(df_vectorizedTokens.shape[1])]
   #unigramTokenFeatureNames=[label_prefix + "_" + tokenDict[i].decode('utf-8')  for i in range(df_vectorizedTokens.shape[1])]
   unigramTokenFeatureNames=[label_prefix + "_" + tokenDict[i]  for i in range(df_vectorizedTokens.shape[1])]
   df_vectorizedTokens.columns = unigramTokenFeatureNames

   # report a sample
   df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 3, replace=False)]
if debug>=2: print df_labeledData.loc[2:2]['taxoPath'] # confirm that the [[vector]]s have not been jumbled.
#df_vectorizedTokens.loc[testRecordId]['ut_asian'] # zirconia?
# simple test to see the features at work
#unigramVectorizer.transform(['00 09 electronics']).toarray() # array([[1, 0, 1, ..., 0, 0, 0]], dtype=int64)

Merge all the separate feature spaces

In [17]:
# fyi, using .join instead appears to be more memory-intensive than using merge()
df_extrFeatures = pd.merge(df_extrContentFeatures, df_vectorizedTokens, how='inner', left_index=True, right_index=True, sort=True,
      suffixes=('_x', '_y'), copy=True)

#df_extrFeatures = df_extrContentFeatures
#df_extrFeatures = df_vectorizedTokens

# clean-up
if not debug: del df_extrContentFeatures, df_vectorizedTokens
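
Note that df_vectorizedTokens was densified with toarray() in In [14], so the merged frame holds roughly 50K x 20K dense values, which is likely what triggers the MemoryError seen later during cross-validation. A lower-memory alternative (a sketch only, not what this notebook does) is to keep the corpus unigram counts sparse and stack the handcrafted features onto them with scipy:

# keep the corpus unigram counts sparse and append the handcrafted features
# as extra sparse columns; LinearSVC accepts the resulting CSR matrix directly
import scipy.sparse as sp

sprs_allFeatures = sp.hstack(
    [sp.csr_matrix(df_extrContentFeatures.values), sprs_vectorizedTokens],
    format='csr')
print sprs_allFeatures.shape  # (n_samples, n_handcrafted + n_unigram_features)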
In [25]:
# DEBUG
if debug>=1:
   print "shape: ", df_extrFeatures.shape # (16000, 7706) # (35163, 13503)
   # report a sample
   print df_extrFeatures.loc[random.choice(df_extrFeatures.index, 3, replace=False)]
   # write-out to a file
if debug>=3:
   #df_extrFeatures.to_csv("df_extrFeatures." + dstamp + ".tsv", sep='\t', encoding='utf-8')
   df_extrFeatures.to_csv("df_extrFeatures." + dstamp + ".csv", encoding='latin1')
shape:  (49891, 20266)
       strLen  tokensCount  BN_A_terms  BN_A_terms2tokens  BN_AE_terms  \
45210      86           10           7           0.700000            0 
14851      52            6           1           0.166667            3 
26371     214           21           4           0.190476            2 

       BN_AE_terms2tokens  PL_AE_terms  PL_AE_terms2tokens  BN_AU_terms  \
45210            0.000000            4            0.400000            2 
14851            0.500000            4            0.666667            3 
26371            0.095238            2            0.095238           17 

       BN_AU_terms2tokens         …          ut_20228  ut_20229  ut_20230  \
45210            0.200000         …                 0         0         0 
14851            0.500000         …                 0         0         0 
26371            0.809524         …                 0         0         0 

       ut_20231  ut_20232  ut_20233  ut_20234  ut_20235  ut_20236  ut_20237
45210         0         0         0         0         0         0         0
14851         0         0         0         0         0         0         0
26371         0         0         0         0         0         0         0

[3 rows x 20266 columns]


Prepare the data to meet scikit-learn's requirements

Mainly, create separate structures for the target data and the feature data.

Prepare the target variable

In [26]:
ndarr_labelData=df_labeledData[colNameRecordLabel]  # array(['AE>VI', 'AE>MU', 'AE>AR', ...

if debug>=0:
    ndarr_classes = pd.unique(df_labeledData[colNameRecordLabel])  # array(['AE>VI', 'AE>MU', 'AE>AR', ...
    print ndarr_classes

targetEncoder = preprocessing.LabelEncoder()
targetEncoder.fit(ndarr_classes)
['MM>MI' 'AU' 'CE' 'AU>PA' 'CB>OT' 'AE>VI' 'HG>HD' 'FD>OT' 'CB>ST' 'FH>FA'
 'AU>VH' 'HG>GA' 'FS>LB' 'CE>OT' 'SF>OS' 'BK>BK' 'SF>GF' 'PT>FI' 'FD>FD'
 'SF>WI' 'ED' 'HO>AP' 'AU>EL' 'IS>IN' 'OT' 'AE>EV' 'SF>BI' 'HO>TO'
 'FS>CL>WO' 'FB>OT' 'FH>KN' 'A' 'FS>CL' 'HO>CR' 'SF>OT' 'FB>CL' 'FS>CL>KD'
 'HG>KD' 'FS>SH' 'CT>HW' 'HG>OT' 'HB>BE' 'CM>AC' 'CP>CA' 'IS>OS' 'FS>AC'
 'CE>HE' 'SF>WA' 'JW>JE' 'AU>OT' 'FS>CL>ME' 'RC' 'HG>HI' 'FS>OT' 'TV'
 'HB>HE' 'GM>HW' 'AE>AR' 'HO>GA' 'CT>OT' 'HG>GI' 'CB>SP' 'JW>WA' 'CT>SW'
 'GM>SW' 'HO>OT' 'MP>BO' 'MM>OT' 'SF>CA' 'HG>CS' 'AE>MU' 'FH>FN' 'MP>OT'
 'PT>DO' 'MM>AC' 'LF' 'CM>DV' 'FS>AC>ME' 'CM>OT' 'HO>HO' 'CE>PE' 'JW>OT'
 'FS>AC>WO' 'FS>SH>WO' 'GM>OT' 'FS>SH>ME' 'BK>MN' 'SH' 'HG>AP' 'FH>OT'
 'HG>FL' 'PT>CA' 'PT>BI' 'PT>OT' 'IS>OT' 'CM' 'FS>AC>KD' 'MP>MO' 'CT'
 'FS>SH>KD' 'CP>VI' 'CP>OT' 'FS>LB>WO' 'FS>OS' 'IS>OE' 'PT' 'IS>IS' 'BK>OT'
 'CA' 'AG>AD' 'FD>GO' 'HG>DE' 'SP' 'FH>SG' 'FS>CK' 'FS']
Out[26]:
LabelEncoder()
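
The fitted targetEncoder maps category strings to integers and back; a small usage sketch (the printed values are illustrative):

# encode string labels to integers for scoring, then decode predictions back
y_encoded = targetEncoder.transform(ndarr_labelData)
print y_encoded[:5]                                    # e.g. an array of small integers
print targetEncoder.inverse_transform(y_encoded[:5])   # the corresponding original label strings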

prepare the source specific versions (for testing)

In [27]:
if debug>=4:
    ndarr_labelData_EB=df_extrFeatures[(df_labeledData.taxoCode=='EB')][colNameRecordLabel]  # array(['AE>VI', 'AE>MU', 'AE>AR', ...
    #ndarr_labelData_EB
    print len(ndarr_labelData_EB) # 21688

    ndarr_labelData_SHZ=df_extrFeatures[(df_labeledData.taxoCode=='SHZ')][colNameRecordLabel]  # array(['AE>VI', 'AE>MU', 'AE>AR', ...
    #ndarr_labelData_SHZ
    print len(ndarr_labelData_SHZ) # 719

Separate the feature data

In [28]:
cols2excludeMc  = ['catid','taxoLabel','pathTokens','taxoCode','taxoPath','pathNodes']
# print "cols2excludeMc: ", cols2excludeMc # ['label01', 'vigPathLabel', ...

cols2retainMc  = [x for x in df_extrFeatures.columns if x not in cols2excludeMc]
#print "cols2retainMc: ", cols2retainMc
# cols2retainMc ['vigPathLabel', 'newStrLen', 'vigStrLen', 'newPathNodesCount', 'vigPathNodesLen', 'newPathTokenCount', 'vigPathTokenCount', 'levenshN2V', 'jaccardN2V']

#df_featuresTadf_featuresTablebleMc=df_extrFeatures [cols2retainMc][(df_extrFeatures.vigPathLabel=='MP>AT')]
df_featuresTable = df_extrFeatures[cols2retainMc]

#ndarr_featuresTable=df_featuresTable.values

# report a sample
#df_featuresTable.loc[random.choice(df_featuresTable.index, 5, replace=False)]
#df_featuresTable.loc[15238]

prepare the source specific versions (for testing)

if debug>=4:
    df_featuresTable_EB=df_extrFeatures [(df_labeledData.taxoCode=='EB')][cols2retainMc]
    #df_featuresTable_EB.loc[random.choice(df_featuresTable_EB.index, 5, replace=False)]
    print len(df_featuresTable_EB) #

    df_featuresTable_SHZ=df_extrFeatures [(df_labeledData.taxoCode=='SHZ')][cols2retainMc]
    #df_featuresTable_SHZ.loc[random.choice(df_featuresTable_SHZ.index, 5, replace=False)]
    print len(df_featuresTable_SHZ) #


Train Model(s)

In [22]:
#svm_clfr_all = svm.SVC(gamma=0.001, C=100., probability=True)
#svm_clfr_all = svm.SVC(kernel='linear', gamma=0.001, C=100.)
svm_clfr_all = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 7.768s

t0 = time()
svm_clfr_all.fit(df_featuresTable, ndarr_labelData) 
train_time = time() - t0
print("train time: %0.3fs" % train_time) # rain time: 71s
train time: 237.171s

Write Out Model

In [23]:
modelFilename = "svm_clfr_all." + dstamp + ".sklearn"
modelFile = modelsDir + modelFilename

joblib.dump(svm_clfr_all, modelFile) 
Out[23]:
['models/svm_clfr_all.150107.sklearn',
 'models/svm_clfr_all.150107.sklearn_01.npy',
 'models/svm_clfr_all.150107.sklearn_02.npy',
 'models/svm_clfr_all.150107.sklearn_03.npy',
 'models/svm_clfr_all.150107.sklearn_04.npy',
 'models/svm_clfr_all.150107.sklearn_05.npy']
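
For scoring new records later (e.g. in a companion apply notebook), the persisted artifacts can be reloaded with joblib; a sketch assuming the file paths defined in the GLOBALS cell above:

# reload the persisted vectorizers and the classifier
corpusBasedUnigramVectorizer = joblib.load(corpusBasedUnigramVectorizerFile)
list_cntVectorizer = joblib.load(dictBasedUnigramVectorizerListFile)
svm_clfr_all = joblib.load(modelFile)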

models for testing

In [29]:
if debug>=4:
    #svm_clfr_SHZ = svm.SVC(gamma=0.001, C=100., probability=True)
    svm_clfr_SHZ = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 1.056s
    t0 = time()
    svm_clfr_SHZ.fit(df_featuresTable_SHZ, ndarr_labelData_SHZ) 
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    #svm_clfr_EB = svm.SVC(gamma=0.001, C=100., probability=True)
    svm_clfr_EB = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 5.330s
    t0 = time()
    svm_clfr_EB.fit(df_featuresTable_EB, ndarr_labelData_EB) 
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)


Test Model(s)

TO DO

  • stratified tests (e.g. keep one source out, keep one source+category out); see the sketch below
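
A sketch of the keep-one-source-out test (illustrative only; it presumes df_labeledData contains more than one recordSource, whereas this particular run used the single merged 'all' source):

# leave-one-source-out: train on all other sources, test on the held-out one
for heldOutSource in df_labeledData[colNameRecordSource].unique():
    testMask = (df_labeledData[colNameRecordSource] == heldOutSource).values
    trainMask = ~testMask
    clf_loso = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
    clf_loso.fit(df_featuresTable[trainMask], ndarr_labelData[trainMask])
    preds = clf_loso.predict(df_featuresTable[testMask])
    print heldOutSource, accuracy_score(targetEncoder.transform(ndarr_labelData[testMask]), targetEncoder.transform(preds))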

In [30]:
ndarr_preds_All_All = svm_clfr_all.predict(df_featuresTable)

# note, accuracy_score requires numerized target labels
print "accuracy_score=", accuracy_score(targetEncoder.transform(ndarr_labelData), targetEncoder.transform(ndarr_preds_All_All))
# in sample 0.7109  # 0.9201  # 0.9261 (linear) # 0.997
accuracy_score= 0.994187328376

Simple 'stratification'

if debug>=2:
    ndarr_preds_EB_SHZ = svm_clfr_EB.predict(df_featuresTable_SHZ)
    accuracy_score(targetEncoder.transform(ndarr_labelData_SHZ), targetEncoder.transform(ndarr_preds_EB_SHZ)) # 0.4881 0.2768
    ndarr_preds_SHZ_EB = svm_clfr_SHZ.predict(df_featuresTable_EB)
    accuracy_score(targetEncoder.transform(ndarr_labelData_EB), targetEncoder.transform(ndarr_preds_SHZ_EB)) # 0.08424 0.139

cross-validation

unfair test: the feature extractors were fit on the full corpus before splitting, so there is information leakage across the folds

In [31]:
# ideally should reuse exact settings from above
clf = svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3) # train time: 7.768s

t0 = time()
scores = cross_validation.cross_val_score(clf, df_featuresTable, targetEncoder.transform(ndarr_labelData), cv=5)
eval_time = time() - t0
print("eval time: %0.3fs" % eval_time) # eval time:

print "scores =" ,scores
print "mean =", scores.mean(), " std =", scores.std()
# scores = [ 0.83715287  0.95904996  0.97941822  0.96415771  0.89734366] # old # 5CVmean = 0.92742448436  std = 0.053116228864
# scores = [ 0.97379232  0.97584884  0.97218656  0.97384967  0.97607107] # 141118 all features
# scores = [ 0.77192237  0.76956954  0.77421195  0.76821949  0.7717438] # 141118 df_extrContentFeatures
# scores = [ 0.97039241  0.9731496   0.97133077  0.9718491   0.97277547] # mean = 0.971899467383  std = 0.000992712274548
C:\Python27\lib\site-packages\sklearn\cross_validation.py:412: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-31-51b2d72c4375> in <module>()
      3 
      4 t0 = time()
----> 5 scores = cross_validation.cross_val_score(clf, df_featuresTable, targetEncoder.transform(ndarr_labelData), cv=5)
      6 eval_time = time() - t0
      7 print("eval time: %0.3fs" % eval_time) # eval time:

C:\Python27\lib\site-packages\sklearn\cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, score_func, pre_dispatch)
   1149                                               train, test, verbose, None,
   1150                                               fit_params)
-> 1151                       for train, test in cv)
   1152     return np.array(scores)[:, 0]
   1153 

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
    642             self._iterating = True
    643             for function, args, kwargs in iterable:
--> 644                 self.dispatch(function, args, kwargs)
    645 
    646             if pre_dispatch == "all" or n_jobs == 1:

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch(self, func, args, kwargs)
    389         """
    390         if self._pool is None:
--> 391             job = ImmediateApply(func, args, kwargs)
    392             index = len(self._jobs)
    393             if not _verbosity_filter(index, self.verbose):

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, func, args, kwargs)
    127         # Don't delay the application, to avoid keeping the input
    128         # arguments in memory
--> 129         self.results = func(*args, **kwargs)
    130 
    131     def get(self):

C:\Python27\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
   1237         estimator.fit(X_train, **fit_params)
   1238     else:
-> 1239         estimator.fit(X_train, y_train, **fit_params)
   1240     test_score = _score(estimator, X_test, y_test, scorer)
   1241     if return_train_score:

C:\Python27\lib\site-packages\sklearn\svm\base.pyc in fit(self, X, y)
    677                              " one.")
    678 
--> 679         X = atleast2d_or_csr(X, dtype=np.float64, order="C")
    680 
    681         self.class_weight_ = compute_class_weight(self.class_weight,

C:\Python27\lib\site-packages\sklearn\utils\validation.pyc in atleast2d_or_csr(X, dtype, order, copy, force_all_finite)
    162     return _atleast2d_or_sparse(X, dtype, order, copy, sp.csr_matrix,
    163                                 "tocsr", sp.isspmatrix_csr,
--> 164                                 force_all_finite)
    165 
    166 

C:\Python27\lib\site-packages\sklearn\utils\validation.pyc in _atleast2d_or_sparse(X, dtype, order, copy, sparse_class, convmethod, check_same_type, force_all_finite)
    139     else:
    140         X = array2d(X, dtype=dtype, order=order, copy=copy,
--> 141                     force_all_finite=force_all_finite)
    142     return X
    143 

C:\Python27\lib\site-packages\sklearn\utils\validation.pyc in array2d(X, dtype, order, copy, force_all_finite)
    117         raise TypeError('A sparse matrix was passed, but dense data '
    118                         'is required. Use X.toarray() to convert to dense.')
--> 119     X_2d = np.asarray(np.atleast_2d(X), dtype=dtype, order=order)
    120     if force_all_finite:
    121         _assert_all_finite(X_2d)

C:\Python27\lib\site-packages\numpy\core\numeric.pyc in asarray(a, dtype, order)
    458 
    459     """
--> 460     return array(a, dtype, copy=False, order=order)
    461 
    462 def asanyarray(a, dtype=None, order=None):

MemoryError: 
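
The information leakage noted above and this MemoryError (the feature table is a roughly 50K x 20K dense matrix) can both be sidestepped by cross-validating a pipeline on the raw text, so that the vectorizer is refit inside each fold and the features stay sparse. A sketch (unigram token features only, so not equivalent to the full feature set used above):

# cross-validate on the raw text: the CountVectorizer is refit per fold (no leakage)
# and its output stays sparse (no dense 50K x 20K matrix in memory)
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer(min_df=1, decode_error="ignore")),
    ('clf',  svm.LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)),
])

scores = cross_validation.cross_val_score(
    text_clf, srs_textCorpus.values, targetEncoder.transform(ndarr_labelData), cv=5)
print "unigram-only CV scores =", scores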

