One-hot encoding for categorical variables with NumPy, Pandas and Scikit-learn
Load a sample dataset from Kaggle
import numpy as np
import pandas as pd
df = pd.read_csv('datasets.csv')
df
The example data view. ID is the serial number of each instance. y is the numeric dependent variable. X0, X1, X2, X3, X4 are categorical independent variables. X10, X11, X12, X13, X14, X15, X16, X17 are binary variables.
ID | y | X0 | X1 | X2 | X3 | X4 | X10 | X11 | X12 | X13 | X14 | X15 | X16 | X17 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 130.81 | k | v | at | a | d | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
6 | 88.53 | k | t | av | e | d | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 76.26 | az | w | n | c | d | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
9 | 80.62 | az | t | n | f | d | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
13 | 78.02 | az | v | n | f | d | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18 | 92.93 | t | b | e | c | d | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
24 | 128.76 | al | r | e | f | d | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
List all features before encoding
df.columns.tolist()
# ['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4',
# 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17']
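The categorical columns can also be picked out programmatically rather than by eye; a small sketch, assuming they are stored with object dtype, as is usual for string columns:

# String-valued (object-dtype) columns are the one-hot encoding candidates here.
df.select_dtypes(include='object').columns.tolist()
# ['X0', 'X1', 'X2', 'X3', 'X4']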
Two main approaches can be applied for one-hot encoding of these string features: (1) Pandas and (2) Scikit-learn. A plain NumPy trick for integer labels is shown at the end.
The classes in the first three categorical features
pd.get_dummies(df['X0']).keys().tolist()
# ['al', 'az', 'k', 't']
pd.get_dummies(df['X1']).keys().tolist()
# ['b', 'r', 't', 'v', 'w']
pd.get_dummies(df['X2']).keys().tolist()
# ['at', 'av', 'e', 'n']
Apply one-hot encoding with Pandas
pd.get_dummies(df['X0']).values
# array([[ 0., 0., 1., 0.],
# [ 0., 0., 1., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 0., 0., 1.],
# [ 1., 0., 0., 0.]])
pd.get_dummies(df['X1']).values
# array([[ 0., 0., 0., 1., 0.],
# [ 0., 0., 1., 0., 0.],
# [ 0., 0., 0., 0., 1.],
# [ 0., 0., 1., 0., 0.],
# [ 0., 0., 0., 1., 0.],
# [ 1., 0., 0., 0., 0.],
# [ 0., 1., 0., 0., 0.]])
pd.get_dummies(df['X2']).values
# array([[ 1., 0., 0., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 0., 0., 1.],
# [ 0., 0., 0., 1.],
# [ 0., 0., 0., 1.],
# [ 0., 0., 1., 0.],
# [ 0., 0., 1., 0.]])
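One caveat before moving on: the float arrays shown above come from an older pandas. Recent pandas returns integer or boolean dummies by default; if needed, the dtype argument of get_dummies (available in newer pandas) reproduces the float output:

# Force float dummies to match the arrays shown above (newer pandas only).
pd.get_dummies(df['X0'], dtype=float).values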
Concatenate all the encoded features. In the reduce call below, axis=1 means concatenation in the horizontal (column) direction.
from functools import reduce  # in Python 3, reduce lives in functools

cat_columns = ['X0', 'X1', 'X2', 'X3', 'X4']
encoded_mat = []
encoded_feature = []
for feature in cat_columns:
    encoded_mat.append(pd.get_dummies(df[feature]).values)
    encoded_feature.extend(map(lambda x: "{}:{}".format(feature, x), pd.get_dummies(df[feature]).keys().tolist()))
reduce(lambda x, y: np.append(x, y, axis=1), encoded_mat)
# array([[ 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.],
# [ 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.],
# [ 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1.],
# [ 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.],
# [ 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.],
# [ 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1.],
# [ 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1.]])
encoded_feature
# ['X0:al', 'X0:az', 'X0:k', 'X0:t', 'X1:b', 'X1:r', 'X1:t', 'X1:v', 'X1:w',
# 'X2:at', 'X2:av', 'X2:e', 'X2:n', 'X3:a', 'X3:c', 'X3:e', 'X3:f', 'X4:d']
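As a side note, pandas can build the same matrix and the same prefixed names in a single call; a minimal equivalent, reusing df and cat_columns from above:

# get_dummies on a DataFrame prefixes each dummy with its source column ("X0:al", ...).
encoded_df = pd.get_dummies(df[cat_columns], prefix_sep=':')
encoded_df.values            # same 0/1 matrix as the reduce() result
encoded_df.columns.tolist()  # same names as encoded_feature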
Apply one-hot encoding with Scikit-learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
Use LabelEncoder to transform nominal variables into numeric variables
new_data = []
feature_class = []
le = LabelEncoder()
categorical_column = ["X0", "X1", "X2", "X3", "X4"]
for feature_name in categorical_column:
    new_data.append(le.fit_transform(df[feature_name]))
    feature_class.extend(map(lambda x: feature_name + "={}".format(x), le.classes_))
le.fit_transform(df['X0'])
#[2 2 1 1 1 3 0]
le.classes_
# ['al', 'az', 'k', 't']
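LabelEncoder also supports the reverse mapping, which is handy for sanity-checking the codes above:

# Map integer codes back to the original category strings.
le.inverse_transform([2, 2, 1, 1, 1, 3, 0])
# array(['k', 'k', 'az', 'az', 'az', 't', 'al'], dtype=object)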
The attributes n_values_ and feature_indices_ in OneHotEncoder can be used to inspect the number of categories and the feature index range for each categorical column.
enc = OneHotEncoder()
enc.fit(np.array(new_data).T)  # transpose so rows are samples, columns are the label-encoded features
enc.n_values_
# [4 5 4 4 1]
enc.feature_indices_
# [4 9 13 17 18]
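Note that n_values_ and feature_indices_ were deprecated in scikit-learn 0.20 and later removed. In recent versions, OneHotEncoder accepts string columns directly (no LabelEncoder step needed) and exposes categories_ instead. A minimal sketch under the newer API, version-dependent, so check your installed scikit-learn:

# scikit-learn >= 0.20: strings are accepted directly.
enc_new = OneHotEncoder(sparse_output=False)  # on versions before 1.2, use sparse=False instead
enc_new.fit(df[categorical_column])
enc_new.categories_
# e.g. [array(['al', 'az', 'k', 't'], dtype=object), ...]  -- one array per column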
Remove original categorical variables from data table
for i in categorical_column:
    if i in df.columns:
        del df[i]
Combine all the encoded features with the original dataset (after removal of the unencoded categorical variables) to create a new table. Note that pd.concat aligns on the index, so both frames must share a default RangeIndex here.
pd.concat([df, pd.DataFrame(enc.transform(np.array(new_data).T).toarray(), columns=feature_class)], axis=1)
ID | y | X10 | X11 | X12 | X13 | X14 | X15 | X16 | X17 | ... | X1=w | X2=at | X2=av | X2=e | X2=n | X3=a | X3=c | X3=e | X3=f | X4=d |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 130.81 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
6 | 88.53 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
7 | 76.26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
9 | 80.62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
13 | 78.02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
18 | 92.93 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
24 | 128.76 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
A simpler version (applied to the original df, before the categorical columns were dropped):
OneHotEncoder().fit_transform(LabelEncoder().fit_transform(df['X0']).reshape(-1,1)).toarray()
# array([[ 0., 0., 1., 0.],
# [ 0., 0., 1., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 1., 0., 0.],
# [ 0., 0., 0., 1.],
# [ 1., 0., 0., 0.]])
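The same one-liner can drop the LabelEncoder step entirely on newer scikit-learn releases that accept string inputs (an assumption about the installed version; df must also still contain X0 here):

# Double brackets pass a 2-D DataFrame, as OneHotEncoder expects.
OneHotEncoder(sparse_output=False).fit_transform(df[['X0']])  # use sparse=False before scikit-learn 1.2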
One-hot encoding with NumPy
import numpy as np

target = [2, 2, 2, 1, 0]  # example integer class labels
num_labels = len(np.unique(target))
all_Y = np.eye(num_labels)[target]  # row i of the identity matrix is the one-hot vector for class i
print(num_labels)
# 3
print(target)
# [2, 2, 2, 1, 0]
print(all_Y)
# array([[ 0., 0., 1.],
# [ 0., 0., 1.],
# [ 0., 0., 1.],
# [ 0., 1., 0.],
# [ 1., 0., 0.]])
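As a quick round-trip check, the original labels can be recovered with argmax, since each one-hot row contains a single 1:

# Recover the original integer labels from the one-hot matrix.
np.argmax(all_Y, axis=1)
# array([2, 2, 2, 1, 0])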