이것은 사실 이후 1 년 반이지만, 나는 .transform()
팬더 데이터 프레임 열을 한 번 에 여러 개 (그리고 .inverse_transform()
그것들도 가능하게) 할 수 있어야했습니다 . 이것은 위의 @PriceHardman의 훌륭한 제안으로 확장됩니다.
class MultiColumnLabelEncoder(LabelEncoder):
"""
Wraps sklearn LabelEncoder functionality for use on multiple columns of a
pandas dataframe.
"""
def __init__(self, columns=None):
self.columns = columns
def fit(self, dframe):
"""
Fit label encoder to pandas columns.
Access individual column classes via indexig `self.all_classes_`
Access individual column encoders via indexing
`self.all_encoders_`
"""
# if columns are provided, iterate through and get `classes_`
if self.columns is not None:
# ndarray to hold LabelEncoder().classes_ for each
# column; should match the shape of specified `columns`
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_encoders_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
# fit LabelEncoder to get `classes_` for the column
le = LabelEncoder()
le.fit(dframe.loc[:, column].values)
# append the `classes_` to our ndarray container
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
# append this column's encoder
self.all_encoders_[idx] = le
else:
# no columns specified; assume all are to be encoded
self.columns = dframe.iloc[:, :].columns
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
le = LabelEncoder()
le.fit(dframe.loc[:, column].values)
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
return self
def fit_transform(self, dframe):
"""
Fit label encoder and return encoded labels.
Access individual column classes via indexing
`self.all_classes_`
Access individual column encoders via indexing
`self.all_encoders_`
Access individual column encoded labels via indexing
`self.all_labels_`
"""
# if columns are provided, iterate through and get `classes_`
if self.columns is not None:
# ndarray to hold LabelEncoder().classes_ for each
# column; should match the shape of specified `columns`
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_encoders_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_labels_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
# instantiate LabelEncoder
le = LabelEncoder()
# fit and transform labels in the column
dframe.loc[:, column] =\
le.fit_transform(dframe.loc[:, column].values)
# append the `classes_` to our ndarray container
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
self.all_labels_[idx] = le
else:
# no columns specified; assume all are to be encoded
self.columns = dframe.iloc[:, :].columns
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
le = LabelEncoder()
dframe.loc[:, column] = le.fit_transform(
dframe.loc[:, column].values)
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
return dframe.loc[:, self.columns].values
def transform(self, dframe):
"""
Transform labels to normalized encoding.
"""
if self.columns is not None:
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[
idx].transform(dframe.loc[:, column].values)
else:
self.columns = dframe.iloc[:, :].columns
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.transform(dframe.loc[:, column].values)
return dframe.loc[:, self.columns].values
def inverse_transform(self, dframe):
"""
Transform labels back to original encoding.
"""
if self.columns is not None:
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.inverse_transform(dframe.loc[:, column].values)
else:
self.columns = dframe.iloc[:, :].columns
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.inverse_transform(dframe.loc[:, column].values)
return dframe.loc[:, self.columns].values
예:
경우 df
및 df_copy()
혼합 유형이다 pandas
dataframes, 당신은을 적용 할 수 있습니다 MultiColumnLabelEncoder()
받는 dtype=object
다음과 같은 방법으로 열 :
# get `object` columns
df_object_columns = df.iloc[:, :].select_dtypes(include=['object']).columns
df_copy_object_columns = df_copy.iloc[:, :].select_dtypes(include=['object']).columns
# instantiate `MultiColumnLabelEncoder`
mcle = MultiColumnLabelEncoder(columns=object_columns)
# fit to `df` data
mcle.fit(df)
# transform the `df` data
mcle.transform(df)
# returns output like below
array([[1, 0, 0, ..., 1, 1, 0],
[0, 5, 1, ..., 1, 1, 2],
[1, 1, 1, ..., 1, 1, 2],
...,
[3, 5, 1, ..., 1, 1, 2],
# transform `df_copy` data
mcle.transform(df_copy)
# returns output like below (assuming the respective columns
# of `df_copy` contain the same unique values as that particular
# column in `df`
array([[1, 0, 0, ..., 1, 1, 0],
[0, 5, 1, ..., 1, 1, 2],
[1, 1, 1, ..., 1, 1, 2],
...,
[3, 5, 1, ..., 1, 1, 2],
# inverse `df` data
mcle.inverse_transform(df)
# outputs data like below
array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
...,
['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
# inverse `df_copy` data
mcle.inverse_transform(df_copy)
# outputs data like below
array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
...,
['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
인덱싱을 통해 각 열을 맞추는 데 사용되는 개별 열 클래스, 열 레이블 및 열 인코더에 액세스 할 수 있습니다.
mcle.all_classes_
mcle.all_encoders_
mcle.all_labels_