Data preprocessing (1): One-hot encoding for categorical variables

One-hot encoding for categorical variables with NumPy, Pandas, and Scikit-learn

Load a sample dataset from Kaggle

import numpy as np
import pandas as pd
df = pd.read_csv('datasets.csv')
df

A view of the example data. ID is the serial number of each instance. y is the numeric dependent variable. X0, X1, X2, X3, X4 are categorical independent variables. X10, X11, X12, X13, X14, X15, X16, X17 are binary variables.

ID   y       X0  X1  X2  X3  X4  X10  X11  X12  X13  X14  X15  X16  X17
0    130.81  k   v   at  a   d   0    0    0    1    0    0    0    0
6    88.53   k   t   av  e   d   0    0    0    0    0    0    0    0
7    76.26   az  w   n   c   d   0    0    0    0    0    0    0    1
9    80.62   az  t   n   f   d   0    0    0    0    0    0    0    0
13   78.02   az  v   n   f   d   0    0    0    0    0    0    0    0
18   92.93   t   b   e   c   d   0    0    0    0    1    0    0    0
24   128.76  al  r   e   f   d   0    0    0    0    1    0    0    0
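
If the Kaggle CSV is not at hand, the sample rows above can be rebuilt directly as a DataFrame (values copied from the table, as a stand-in for the CSV), so that every snippet below runs as-is:

import pandas as pd

# Rebuild the example rows shown above (hypothetical stand-in for the CSV).
df = pd.DataFrame({
    'ID':  [0, 6, 7, 9, 13, 18, 24],
    'y':   [130.81, 88.53, 76.26, 80.62, 78.02, 92.93, 128.76],
    'X0':  ['k', 'k', 'az', 'az', 'az', 't', 'al'],
    'X1':  ['v', 't', 'w', 't', 'v', 'b', 'r'],
    'X2':  ['at', 'av', 'n', 'n', 'n', 'e', 'e'],
    'X3':  ['a', 'e', 'c', 'f', 'f', 'c', 'f'],
    'X4':  ['d'] * 7,
    'X10': [0] * 7, 'X11': [0] * 7, 'X12': [0] * 7,
    'X13': [1, 0, 0, 0, 0, 0, 0],
    'X14': [0, 0, 0, 0, 0, 1, 1],
    'X15': [0] * 7, 'X16': [0] * 7,
    'X17': [0, 0, 1, 0, 0, 0, 0],
})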

List all features before encoding

df.columns.tolist()
# ['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4',
#  'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17']

Two approaches can be applied for one-hot encoding: (1) Pandas and (2) Scikit-learn.

The classes in the first three categorical features

pd.get_dummies(df['X0']).keys().tolist()
# ['al', 'az', 'k', 't']
pd.get_dummies(df['X1']).keys().tolist()
# ['b', 'r', 't', 'v', 'w']
pd.get_dummies(df['X2']).keys().tolist()
# ['at', 'av', 'e', 'n']

Apply one-hot encoding with Pandas

pd.get_dummies(df['X0']).values
# array([[ 0.,  0.,  1.,  0.],
#        [ 0.,  0.,  1.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  0.,  0.,  1.],
#        [ 1.,  0.,  0.,  0.]])

pd.get_dummies(df['X1']).values
# array([[ 0.,  0.,  0.,  1.,  0.],
#        [ 0.,  0.,  1.,  0.,  0.],
#        [ 0.,  0.,  0.,  0.,  1.],
#        [ 0.,  0.,  1.,  0.,  0.],
#        [ 0.,  0.,  0.,  1.,  0.],
#        [ 1.,  0.,  0.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.,  0.]])

pd.get_dummies(df['X2']).values
# array([[ 1.,  0.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  0.,  0.,  1.],
#        [ 0.,  0.,  0.,  1.],
#        [ 0.,  0.,  0.,  1.],
#        [ 0.,  0.,  1.,  0.],
#        [ 0.,  0.,  1.,  0.]])

Concatenate all of the encoded features. In the reduce call, axis=1 means the arrays are concatenated horizontally (column-wise).

from functools import reduce   # built-in on Python 2; imported from functools on Python 3

cat_columns = ['X0', 'X1', 'X2', 'X3', 'X4']
encoded_mat = []
encoded_feature = []
for feature in cat_columns:
    dummies = pd.get_dummies(df[feature])
    encoded_mat.append(dummies.values)
    encoded_feature.extend("{}:{}".format(feature, x) for x in dummies.keys().tolist())
reduce(lambda x, y: np.append(x, y, axis=1), encoded_mat)
# array([[ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.],
#        [ 0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.],
#        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.],
#        [ 0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.],
#        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.],
#        [ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.],
#        [ 1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  1.]])

encoded_feature
# ['X0:al', 'X0:az', 'X0:k', 'X0:t', 'X1:b', 'X1:r', 'X1:t', 'X1:v', 'X1:w',
#  'X2:at', 'X2:av', 'X2:e', 'X2:n', 'X3:a', 'X3:c', 'X3:e', 'X3:f', 'X4:d']
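
Pandas can also do all of this in one call: get_dummies accepts a whole DataFrame and prefixes each dummy column with its source column name, which reproduces the matrix and feature names above without the loop.

# One-call equivalent of the loop above; prefix_sep=':' yields names like 'X0:al'.
encoded_df = pd.get_dummies(df[cat_columns], prefix_sep=':')
encoded_df.columns.tolist()
# ['X0:al', 'X0:az', 'X0:k', 'X0:t', 'X1:b', 'X1:r', 'X1:t', 'X1:v', 'X1:w',
#  'X2:at', 'X2:av', 'X2:e', 'X2:n', 'X3:a', 'X3:c', 'X3:e', 'X3:f', 'X4:d']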

Apply one-hot encoding with Scikit-learn

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

Use LabelEncoder to transform nominal variables into numeric variables

new_data = []
feature_class = []
le = LabelEncoder()
categorical_column = ["X0", "X1", "X2", "X3", "X4"]
for feature_name in categorical_column:
    new_data.append(le.fit_transform(df[feature_name]))
    feature_class.extend(map(lambda x:feature_name+"={}".format(x), le.classes_))

le.fit_transform(df['X0'])
# array([2, 2, 1, 1, 1, 3, 0])
le.classes_
# array(['al', 'az', 'k', 't'], dtype=object)

The attributes n_values_ and feature_indices_ of OneHotEncoder can be used to inspect how many values each categorical feature takes and where each feature's columns start and end in the encoded matrix.

enc = OneHotEncoder()
enc.fit(np.array(new_data).T)   # rows = samples, columns = the five encoded features
enc.n_values_
# array([4, 5, 4, 4, 1])
enc.feature_indices_
# array([ 0,  4,  9, 13, 17, 18])
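
Note that n_values_ and feature_indices_ were deprecated in scikit-learn 0.20 and later removed. Recent versions of OneHotEncoder encode string columns directly, so the LabelEncoder detour is unnecessary; a minimal sketch against the newer API (the sparse_output keyword was called sparse before scikit-learn 1.2):

# Newer scikit-learn: fit string columns directly; categories_ replaces n_values_.
ohe = OneHotEncoder(sparse_output=False)   # use sparse=False on scikit-learn < 1.2
ohe.fit_transform(df[categorical_column])
ohe.categories_
# [array(['al', 'az', 'k', 't'], dtype=object),
#  array(['b', 'r', 't', 'v', 'w'], dtype=object), ...]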

Remove the original categorical variables from the data table

for i in categorical_column:
    if i in df.columns:
        del df[i]
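
The same removal can be written in one call with DataFrame.drop; errors='ignore' mirrors the membership check above.

# Equivalent one-liner: drop the encoded source columns.
df = df.drop(columns=categorical_column, errors='ignore')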

Combine all the encoded features with the original dataset (after removing the unencoded categorical variables) to create a new table.

pd.concat([df, pd.DataFrame(enc.transform(np.array(new_data).T).toarray(), columns=feature_class)], axis=1)
ID      y   X10  X11  X12  X13  X14  X15  X16  X17  ...  X1=w  X2=at  X2=av  X2=e  X2=n  X3=a  X3=c  X3=e  X3=f  X4=d
 0  130.81    0    0    0    1    0    0    0    0  ...   0.0    1.0    0.0   0.0   0.0   1.0   0.0   0.0   0.0   1.0
 6   88.53    0    0    0    0    0    0    0    0  ...   0.0    0.0    1.0   0.0   0.0   0.0   0.0   1.0   0.0   1.0
 7   76.26    0    0    0    0    0    0    0    1  ...   1.0    0.0    0.0   0.0   1.0   0.0   1.0   0.0   0.0   1.0
 9   80.62    0    0    0    0    0    0    0    0  ...   0.0    0.0    0.0   0.0   1.0   0.0   0.0   0.0   1.0   1.0
13   78.02    0    0    0    0    0    0    0    0  ...   0.0    0.0    0.0   0.0   1.0   0.0   0.0   0.0   1.0   1.0
18   92.93    0    0    0    0    1    0    0    0  ...   0.0    0.0    0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0
24  128.76    0    0    0    0    1    0    0    0  ...   0.0    0.0    0.0   1.0   0.0   0.0   0.0   0.0   1.0   1.0
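
One caveat: pd.concat aligns rows on the index. Here df still carries the default RangeIndex from read_csv, so the rows line up; if the frame had been filtered beforehand, reset the index first, e.g. before the concat above:

# Reset to a clean 0..n-1 index so the encoded rows (which use a fresh
# RangeIndex) align with the original rows during pd.concat.
df = df.reset_index(drop=True)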

A simpler one-liner for a single column (run against the original df, before the categorical columns were dropped)

OneHotEncoder().fit_transform(LabelEncoder().fit_transform(df['X0']).reshape(-1,1)).toarray()
# array([[ 0.,  0.,  1.,  0.],
#        [ 0.,  0.,  1.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  1.,  0.,  0.],
#        [ 0.,  0.,  0.,  1.],
#        [ 1.,  0.,  0.,  0.]])

One-hot encoding with NumPy

import numpy as np

target = np.array([2, 2, 2, 1, 0])   # integer class labels to encode

num_labels = len(np.unique(target))
all_Y = np.eye(num_labels)[target]   # pick one identity-matrix row per label

print(num_labels)
# 3
print(target)
# [2 2 2 1 0]
print(all_Y)
# [[ 0.  0.  1.]
#  [ 0.  0.  1.]
#  [ 0.  0.  1.]
#  [ 0.  1.  0.]
#  [ 1.  0.  0.]]
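
The identity-matrix trick is also easy to invert: the argmax along axis 1 recovers the integer labels.

# Recover the original integer labels from the one-hot rows.
print(np.argmax(all_Y, axis=1))
# [2 2 2 1 0]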