在机器学习的预处理步骤中,处理n-gram变量(例如`SUBSTRING_4L_V3`)时我遇到了一些问题。
我能够分别对数值变量、分类变量和n-gram变量进行转换和标准化,代码如下:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

# Toy data set: one numerical column, two categorical columns, several
# n-gram style columns and the target column 'DISEASE'.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
}
df = pd.DataFrame(data)


def transform_numerical():
    """Standardize the 'AGE' column and print raw and scaled train/test splits.

    The scaler is fitted on the training split only and then applied to
    both splits.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)

    scaler = preprocessing.StandardScaler().fit(x_train)
    x_trainT = scaler.transform(x_train)
    x_testT = scaler.transform(x_test)

    print(x_train)
    print(x_trainT)
    print()
    print(x_test)
    print(x_testT)
    print('/////////////////////////', '\n')


transform_numerical()


def transform_categorical():
    """One-hot encode 'URBAN' and 'NAME' and print encoded and raw splits.

    Missing values are first replaced by the empty string; the encoder is
    fitted on the imputed training split and ignores categories it has not
    seen when transforming the test split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_trainT)
    x_trainT = encoder.transform(x_trainT)
    x_testT = encoder.transform(x_testT)

    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')


transform_categorical()


def transform_list():
    """Vectorize 'SUBSTRING_4L_V3' with CountVectorizer and print the counts.

    The imputer returns a 2-D array, so each split is flattened with
    ravel() before being handed to CountVectorizer.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    # Flatten the single-column 2-D arrays to 1-D.
    x_trainT = x_trainT.ravel()
    # The test split is imputed and flattened but not vectorized or printed.
    x_testT = x_testT.ravel()

    count_vect = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)

    print(x_trainT.toarray())
    print('/////////////////////////', '\n')


transform_list()
对于`SUBSTRING_4L_V3`,我需要先通过`ravel()`将其展平,然后再应用`CountVectorizer()`。
然而,我不清楚如何在机器学习流水线(`Pipeline`)中按顺序实现这两个步骤,下面是我的尝试:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression


# NOTE: kept exactly as posted in the question. The class defines no
# transform() method and fit() returns self.ravel() instead of self; this
# is the step named in the TypeError quoted after this listing.
class RavelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self.ravel()


data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
}
df = pd.DataFrame(data)

x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']],
    df['DISEASE'], test_size=0.5, random_state=3)

# Numerical columns: median imputation followed by standardization.
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with '' and one-hot encode.
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# n-gram column: impute, flatten to 1-D, then count tokens.
transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word',
                                        tokenizer=None,
                                        preprocessor=None,
                                        stop_words=None,
                                        max_features=5000))])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
    ])

ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)

model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))
错误:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't
回答:
错误信息告诉您,您的`RavelTransformer`类中没有实现`transform`方法。此外,`fit`方法应当返回`self`,而不是`self.ravel()`。
我猜您想要实现的是类似下面这样的类:
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens its input to a 1-D array.

    Intended to sit between a step that outputs a single-column 2-D array
    and a step that expects a 1-D sequence.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` flattened to one dimension.

        ``X`` is converted with ``np.asarray`` first, so DataFrames and
        nested lists are accepted in addition to NumPy arrays.
        """
        # Local import keeps this snippet usable on its own.
        import numpy as np

        return np.asarray(X).ravel()
在这里,您的`RavelTransformer`在`fit`步骤中不做任何事情(只返回`self`),而在`transform`步骤中按预期通过`ravel()`将数据展平为一维数组。