Gabor Melli's v2nodeClassify.Predict.150405
Jump to navigation
Jump to search
v2nodeClassify.Predict.150405
#predsFile="predsJoined." + taxoCode + "." + dstamp + ".tsv"
#df_predsJoined=pd.merge(df_predictions, df_VIG_lab, on='taxoLabel', suffixes=['_left', '_right'])
#df_predsJoined.to_csv(predsFile, sep='\t')
Gabor_Melli's_v2nodeClassify.Predict.150405 is a supervised product-taxonomy node prediction system.
References
2015
PREDICT TAXO-NODE OF UNLABELED DATA¶
This notebook represents a supervised product taxonomy classification system (based on a one-vs-rest supervised classification algorithm).
To Do
- Add text preprocessing
- Make featurization a subroutine
- Include a feature for the type of record (taxonomy, or document)
- Include dictionary-based features
In [1]:
# LIBRARIES
# Python 2 notebook (bare `print` statements), pinned to pandas 0.15.x,
# numpy 1.9.x and scikit-learn 0.15.x per the version comments below.
debug = 1  # verbosity: 0 silent, 1 progress messages, >=2 extra samples, >=3 full dumps
import pandas as pd
if debug: print "pandas version: " + pd.__version__ # pandas version: 0.15.2
from pandas import DataFrame, Series
import numpy as np
if debug: print "numpy version: " + np.__version__ # numpy version: 1.9.2
from numpy import random # random
from re import split
from sklearn import preprocessing, svm, cross_validation # labelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.feature_extraction.text  # also binds the top-level `sklearn` name used on the next line
from sklearn.externals import joblib
if debug: print "sklearn version: " + sklearn.__version__ # sklearn version: 0.15.2
import gc
In [2]:
# GLOBALS
from datetime import datetime
from time import time
# Date/time stamps used to version input/output artifact filenames.
dstamp=datetime.now().strftime("%y%m%d")
if debug: print "dstamp=" + dstamp # dstamp=141113
# Deliberate override: pin dstamp to the training run's date so the model
# and vectorizer filenames built below match the artifacts on disk.
dstamp="150405"
tstamp=datetime.now().strftime("%y%m%d%H%M%S")
if debug: print "tstamp=" + tstamp # tstamp=141113032844
# Input data and persisted-model directories (relative paths).
dataDir = "../data/"
modelsDir = "models/"
dictionaryFile="PC_BN_PF_terms.141107b.tsv"
# Feature-extraction artifacts produced by the companion training notebook.
corpusBasedUnigramVectorizerFilename = "corpusBasedUnigramVectorizer." + dstamp + ".sklearn"
corpusBasedUnigramVectorizerFile = modelsDir + corpusBasedUnigramVectorizerFilename
dictPopularPairsFilename = "dictPopularPairs." + dstamp + ".tsv"
dictPopularPairsFile = modelsDir + dictPopularPairsFilename
dictBasedUnigramVectorizerListFilename = "dictBasedUnigramVectorizerList." + dstamp + ".sklearn"
dictBasedUnigramVectorizerListFile = modelsDir + dictBasedUnigramVectorizerListFilename
# Trained one-vs-rest SVM classifier artifact.
modelFilename="svm_clfr_all." + dstamp + ".sklearn"
modelFile=modelsDir + modelFilename
# Canonical column names shared across the cells below.
colNameRecordType = 'recordType'
colNameRecordTextContent = 'recordTextContent'
colNameRecordTextContentOrig = 'recordTextContentOrig'
colNameRecordLabel = 'recordLabel'
colNameRecordSource = 'recordSource'
colNameTermType = 'type'
colNameTermCategory = 'category'
colNameRecordTextContentTokens = 'tokens'
colNameRecordTextContentTokensCount = 'tokensCount'
In [19]:
# Select which unlabeled input file to process: fileId 28 = the 'amazon'
# taxonomy-path mapping file (see table below).
fileToProcess = 28 ;
# Catalog of candidate unlabeled inputs: source code, record type, filename,
# and (where the text column is not 'taxoPath') the column holding the text.
df_unlbldTaxoDataInfo = pd.DataFrame([
{ 'fileId':0, 'colNameRecordSource': 'TST', 'colNameRecordType': 'taxoPath', 'dataFilename': 'test2.txt'},
{ 'fileId':1, 'colNameRecordSource': 'SHO', 'colNameRecordType': 'taxoPath', 'dataFilename': 'offerTaxo_unlbld_SHO.140515.tsv'},
{ 'fileId':2, 'colNameRecordSource': 'TRG', 'colNameRecordType': 'taxoPath', 'dataFilename': 'offerTaxo_unlabld_IMR-Target.141001.txt'},
{ 'fileId':3, 'colNameRecordSource': 'CJ', 'colNameRecordType': 'taxoPath', 'dataFilename': 'offerTaxo_unlabld_CJ.141001b.txt'},
{ 'fileId':4, 'colNameRecordSource': 'CPROD_sml', 'colNameRecordType': 'passage', 'dataFilename': 'CPROD1_unlbld_0-40.141030.tsv'},
{ 'fileId':5, 'colNameRecordSource': 'CPROD_med', 'colNameRecordType': 'passage', 'dataFilename': 'CPROD1_unlbld_170-230.141030.tsv'},
{ 'fileId':6, 'colNameRecordSource': 'dictTerms', 'colNameRecordType': 'term', 'dataFilename': 'PC_BN_PF_terms.141107.tsv', 'dataColName':'term'},
{ 'fileId':7, 'colNameRecordSource': 'dictTerms2', 'colNameRecordType': 'term', 'dataFilename': 'PC_BN_PF_BPC_terms.141208b.tsv', 'dataColName':'term'},
{ 'fileId':8, 'colNameRecordSource': 'terms2', 'colNameRecordType': 'term', 'dataFilename': 'toLabel.terms.150111b.tsv', 'dataColName':'term'},
{ 'fileId':9, 'colNameRecordSource': 'all', 'colNameRecordType': 'mixed', 'dataFilename': 'pcTaxo_labeled.150107b.tsv', 'dataColName':'recordTtextContent'},  # NOTE(review): 'recordTtextContent' looks like a typo for 'recordTextContent' -- confirm against the file's header before using fileId 9
{ 'fileId':10,'colNameRecordSource': 'EUS', 'colNameRecordType': 'taxoPath', 'dataFilename': 'pcTaxo_unlabld_EUS.150306.tsv'},
{ 'fileId':11,'colNameRecordSource': 'EUK', 'colNameRecordType': 'taxoPath', 'dataFilename': 'pcTaxo_unlabld_EUK.150306.tsv'},
{ 'fileId':12,'colNameRecordSource': 'EUSnotDelta','colNameRecordType': 'taxoPath', 'dataFilename': 'pcTaxo_unlabld_EUSnonDelta.150306.tsv'},
{ 'fileId':13,'colNameRecordSource': 'become', 'colNameRecordType': 'taxoPath', 'dataFilename': 'become_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':14,'colNameRecordSource': 'commission_junction_1','colNameRecordType': 'taxoPath', 'dataFilename': 'commission_junction_category_mappings.ascii.b.1of2.tsv', 'dataColName':'Feed Category'},
{ 'fileId':34,'colNameRecordSource': 'commission_junction_2','colNameRecordType': 'taxoPath', 'dataFilename': 'commission_junction_category_mappings.ascii.b.2of2.tsv', 'dataColName':'Feed Category'},  # NOTE(review): fileId 34 is out of sequence (between 14 and 15) -- intentional? it is still unique
{ 'fileId':15,'colNameRecordSource': 'ebay', 'colNameRecordType': 'taxoPath', 'dataFilename': 'ebay_category_mapping.140413.tsv', 'dataColName':'Feed Category'},
{ 'fileId':16,'colNameRecordSource': 'ebay_nondelta', 'colNameRecordType': 'taxoPath', 'dataFilename': 'ebay_nondelta_category_mapping.tsv', 'dataColName':'Feed Category'},
{ 'fileId':17,'colNameRecordSource': 'ebay_uk', 'colNameRecordType': 'taxoPath', 'dataFilename': 'ebay_uk_category_mapping.tsv', 'dataColName':'Feed Category'},
{ 'fileId':18,'colNameRecordSource': 'impact_radius', 'colNameRecordType': 'taxoPath', 'dataFilename': 'impact_radius_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':19,'colNameRecordSource': 'link_share', 'colNameRecordType': 'taxoPath', 'dataFilename': 'link_share_category_mappings.ascii.tsv', 'dataColName':'Feed Category'},
{ 'fileId':20,'colNameRecordSource': 'pricegrabber', 'colNameRecordType': 'taxoPath', 'dataFilename': 'pricegrabber_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':21,'colNameRecordSource': 'pricegrabber_uk','colNameRecordType': 'taxoPath', 'dataFilename': 'pricegrabber_uk_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':22,'colNameRecordSource': 'shopping', 'colNameRecordType': 'taxoPath', 'dataFilename': 'shopping_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':23,'colNameRecordSource': 'shopzilla', 'colNameRecordType': 'taxoPath', 'dataFilename': 'shopzilla_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':24,'colNameRecordSource': 'walmart', 'colNameRecordType': 'taxoPath', 'dataFilename': 'walmart_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':25,'colNameRecordSource': 'viglink', 'colNameRecordType': 'taxoPath', 'dataFilename': 'viglink_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':27,'colNameRecordSource': 'shopzilla_uk', 'colNameRecordType': 'taxoPath', 'dataFilename': 'shopzilla_uk_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':28,'colNameRecordSource': 'amazon', 'colNameRecordType': 'taxoPath', 'dataFilename': 'amazon_category_mappings.tsv', 'dataColName':'Feed Category'},
{ 'fileId':29,'colNameRecordSource': 'amazon_uk', 'colNameRecordType': 'taxoPath', 'dataFilename': 'amazon_uk_category_mappings.tsv', 'dataColName':'Feed Category'},
])
#],index)
#if debug: print df_unlbldTaxoDataInfo.loc[fileToProcess]
# Display the selected row (last expression => rich output).
df_unlbldTaxoDataInfo[(df_unlbldTaxoDataInfo.fileId==fileToProcess)]
Out[19]:
Read in the unlabeled data¶
In [20]:
# load a cherry-picked file
# Look up the metadata row for the selected fileId, then read its TSV.
df = df_unlbldTaxoDataInfo[(df_unlbldTaxoDataInfo.fileId==fileToProcess)]
dataFilename = df.iloc[0]['dataFilename']
dataCode = df.iloc[0]['colNameRecordSource']
dataColName = df.iloc[0]['dataColName']  # NaN for rows without a 'dataColName' entry -- TODO confirm the rename in the next cell tolerates that
# quoting=3 is csv.QUOTE_NONE: treat quote characters as literal text.
df_unlabeledData = DataFrame(pd.read_csv(dataDir + dataFilename, delimiter='\t', quoting=3, skipinitialspace=True))
#df_unlabeledData.rename(columns={'term': colNameRecordTextContent}, inplace=True)
# Hard-coded rename for files whose text column is literally 'taxoPath';
# other files are renamed via dataColName in the cleaning cell below.
df_unlabeledData.rename(columns={'taxoPath': colNameRecordTextContent}, inplace=True)
#df_unlabeledData = df_unlabeledData.loc[random.choice(df_unlabeledData.index, 10, replace=False)] # random sample
#df_unlabeledData.index = range(0, len(df_unlabeledData))
if debug:
    print "dataFilename =", dataFilename
    print "dataColName =", dataColName
    print "dataCode =", dataCode
    print "record count:", len(df_unlabeledData) # 16893
    print "\nsample:\n", df_unlabeledData.loc[random.choice(df_unlabeledData.index, 10, replace=False)] # random sample
Clean the data¶
In [21]:
df_unlabeledData.rename(columns={dataColName:colNameRecordTextContent}, inplace=True)
# data cleanup the data
df_unlabeledData[colNameRecordTextContentOrig] = df_unlabeledData[colNameRecordTextContent] # keep the orig
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].fillna('value is missing on this record') # fill missing data
#if debug: df_unlabeledData[colNameRecordTextContent].isnull()
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace("/"," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace(":"," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace(","," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace(";"," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace("~"," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace("&"," ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace(">"," > ")
df_unlabeledData[colNameRecordTextContent] = df_unlabeledData[colNameRecordTextContent].str.replace(" "," ")
if debug:
print "\nsample:\n", df_unlabeledData.loc[random.choice(df_unlabeledData.index, 10, replace=False)] # random sample
Read in the trained model¶
In [22]:
# Deserialize the trained one-vs-rest SVM classifier from disk.
svm_clfr_all = joblib.load(modelFile)
Read in the featurize-extraction model(s)¶
In [23]:
# divided between two files
# One fitted CountVectorizer per popular (type, category) dictionary entry.
list_cntVectorizer = joblib.load(dictBasedUnigramVectorizerListFile)
# NOTE(review): DataFrame().from_csv is a deprecated idiom in later pandas;
# it works on 0.15 and reads with the first column as the index.
df_popularCategoryTypes=pd.DataFrame().from_csv(dictPopularPairsFile, sep='\t')
if debug: print df_popularCategoryTypes
Extract the features¶
Create the feature vector¶
In [24]:
# create an empty array with x records
# Zero-column frame aligned to the unlabeled data's row count; feature
# columns are appended to it by the cells below.
df_extrContentFeatures = pd.DataFrame(np.empty((len(df_unlabeledData.index),0)))
In [25]:
# Display per-record text lengths (sanity check on the cleaned column).
df_unlabeledData[colNameRecordTextContent].str.len()
Out[25]:
In [26]:
# Load the corpus-trained unigram CountVectorizer artifact.
corpusBasedUnigramVectorizer = joblib.load(corpusBasedUnigramVectorizerFile)
In [27]:
df_extrContentFeatures['strLen'] = df_unlabeledData[colNameRecordTextContent].str.len()
df_unlabeledDataDerived = pd.DataFrame(np.empty((len(df_unlabeledData.index),0))) # shell array
df_unlabeledDataDerived['pathNodes'] = df_unlabeledData.recordTextContent.apply(lambda s: s.split(' > '))
df_unlabeledDataDerived[colNameRecordTextContentTokens] = df_unlabeledData.recordTextContent.apply(lambda s: split('[>| |&]+',s))
# present a sample
if debug>=2: print "derived data:", df_unlabeledDataDerived.loc[random.choice(df_labeledDataDerived.index, 5, replace=False)]
df_extrContentFeatures['tokensCount'] = df_unlabeledDataDerived[colNameRecordTextContentTokens].str.len()
# present a sample
# error check
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print "df_extrContentFeatures:\n", df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 5, replace=False)]
In [28]:
# time consuming
for index, record in df_popularCategoryTypes.iterrows(): # ordered as a debugging aid
productType, productCategory = record[colNameTermType], record[colNameTermCategory]
t0 = time()
cntVectorizer = list_cntVectorizer[index]
cv_list_pre = cntVectorizer.transform(df_unlabeledData[colNameRecordTextContent])
cv_list = cv_list_pre.toarray().sum(axis=1).tolist()
df_extrContentFeatures[productType+"_"+productCategory+"_terms"] = cv_list
df_extrContentFeatures[productType+"_"+productCategory+"_terms2tokens"] = cv_list / df_extrContentFeatures[colNameRecordTextContentTokensCount]
timeDelta = time() - t0
if debug: print "index:", index, "\tproductCategory:", record['category'], " productType:", record['type'], "timeDelta:", timeDelta
if debug>=3:
print cntVectorizer.transform(df_testTextItem['textItem']).toarray()
print cv_list, "\n"
# report a sample
if len(df_extrContentFeatures.index)<2: print "ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
# report a sample
elif debug: print df_extrContentFeatures.loc[random.choice(df_extrContentFeatures.index, 2, replace=True)]
In [29]:
srs_textCorpus = df_unlabeledData[colNameRecordTextContent]
# apply this feature extractor
sprs_vectorizedTokens = corpusBasedUnigramVectorizer.transform(srs_textCorpus)
from scipy.sparse import csr_matrix, issparse, isspmatrix, isspmatrix_csc, isspmatrix_csr, isspmatrix_bsr, isspmatrix_lil, isspmatrix_dok, isspmatrix_coo, isspmatrix_dia
import numpy as np
from scipy import int8
sprs_vectorizedTokens2 = csr_matrix(sprs_vectorizedTokens, dtype=int8) # squeeze int64 to save memory
narr_vectorizedTokens = sprs_vectorizedTokens2.toarray()
df_vectorizedTokens = pd.DataFrame(narr_vectorizedTokens)
if debug>=2:
label_prefix="ut"
unigramTokenFeatureNames=[label_prefix + "_" + str(i) for i in range(df_vectorizedTokens.shape[1])]
df_vectorizedTokens.columns = unigramTokenFeatureNames
print tokenDict # [u'00', u'08', u'09', u'10', u'100', u'1000', u'1001', ..., u'0042g3c1a41e', ..., u'146qqsspagenamezwdvwqqrdz1qqcmdzviewitem',
# for debugging keep the feature names
tokenDict = corpusBasedUnigramVectorizer.get_feature_names()
# report a sample
if debug: print "df_vectorizedTokens sample\n", df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 4, replace=False)]
In [30]:
# DEBUG: relabel the unigram feature columns with their actual token strings
if debug>=2:
    # BUG FIX: original referenced df_vectorizedTokensTest, which is never
    # defined in this notebook (leftover from the companion training notebook);
    # this notebook's frame is df_vectorizedTokens. label_prefix and tokenDict
    # come from the previous cell's debug>=2 branch -- TODO confirm that branch
    # always runs first when debug>=2.
    #unigramTokenFeatureNames=[label_prefix+"_"+str(i) for i in range(df_vectorizedTokens.shape[1])]
    unigramTokenFeatureNames=[label_prefix+"_"+tokenDict[i] for i in range(df_vectorizedTokens.shape[1])]
    df_vectorizedTokens.columns = unigramTokenFeatureNames
    # report a sample
    df_vectorizedTokens.loc[random.choice(df_vectorizedTokens.index, 3, replace=False)]
In [31]:
#merge the separate feature spaces
#df_extrFeaturesTest=df_extrContentFeaturesTest.join(df_vectorizedTokensTest)
# fyi, using .join instead appears to be memory intensive than using merge()
df_extrFeatures = pd.merge(df_extrContentFeatures, df_vectorizedTokens, how='inner', left_index=True, right_index=True, sort=True,
suffixes=('_x', '_y'), copy=True)
print "shape: ", df_extrFeatures.shape # (16000, 7706) # (31056, 13468)
# report a sample
df_extrFeatures.loc[random.choice(df_extrFeatures.index, 3, replace=False)]
# write-out to a file
#df_extrFeaturesTest.to_csv("df_extrFeaturesTest." + dstamp + ".csv", sep='\t', encoding='utf-8')
Out[31]:
In [32]:
# Score every unlabeled record with the trained one-vs-rest SVM.
#df_extrFeaturesSlice = df_extrFeatures[0:1]
#ndarr_preds_All = svm_clfr_all.predict(df_extrFeaturesSlice)
ndarr_preds_All = svm_clfr_all.predict(df_extrFeatures)
# ndarr_preds_All_Test[0:5]
# array(['n/a', 'n/a', 'CB>OT', 'n/a', 'n/a'], dtype=object)
# BUG FIX: original wrote `gc.collect` without parentheses -- a bare attribute
# access that never ran; the call actually frees memory before the next step.
gc.collect()
# Per-class decision scores; keep only each record's maximum as a confidence.
ndarr_preds_All_Score = svm_clfr_all.decision_function(df_extrFeatures)
#ndarr_preds_All_Score[:3]
#import numpy as np
ndarr_preds_All_Score = np.column_stack((ndarr_preds_All,ndarr_preds_All_Score.max(axis=1)))
ndarr_preds_All_Score
# array([ ['CT>HW', -0.7373890220954527], ['n/a', -0.8697072603624554], ['CT>HW', -0.6463626722157896], ...
Out[32]:
In [33]:
# Wrap the (label, max-score) pairs in a named-column DataFrame so they can
# be joined back onto the input records.
df_predLabels = pd.DataFrame(ndarr_preds_All_Score, columns=["taxoLabel", "predScore"])
# report a sample
df_predLabels.loc[random.choice(df_predLabels.index, 3, replace=False)]
Out[33]:
In [34]:
df_predictions = df_unlabeledData.join(df_predLabels)
predsFile="df_predictions." + dataCode + "." + dstamp + ".tsv"
df_predictions.to_csv(predsFile, sep='\t')
if debug:
print df_predictions.loc[random.choice(df_predictions.index, 25, replace=False)]
In [ ]: