如何在scikit-learn的机器学习流程中添加自定义中间预处理器来处理n-gram列？

在机器学习预处理步骤中处理n-gram变量（如SUBSTRING_4L_V3）给我带来了一些问题。

我能够分别对数值、分类和n-gram变量进行转换和标准化，代码如下：

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

# Toy dataset: one numeric column, two categorical columns, several n-gram
# representations of NAME, and the DISEASE target.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }
df = pd.DataFrame(data)


def transform_numerical():
    """Split AGE into train/test, standardize it, and print both versions."""
    x_train, x_test, y_train, y_test = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)
    # The scaler is fitted on the training split only and reused for the test split.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_trainT = scaler.transform(x_train)
    x_testT = scaler.transform(x_test)
    print(x_train)
    print(x_trainT)
    print()
    print(x_test)
    print(x_testT)
    print('/////////////////////////', '\n')


transform_numerical()


def transform_categorical():
    """Impute and one-hot encode URBAN and NAME, then print the results."""
    x_train, x_test, y_train, y_test = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)
    # Missing values are replaced with the empty string before encoding.
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_trainT)
    x_trainT = encoder.transform(x_trainT)
    x_testT = encoder.transform(x_testT)
    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')


transform_categorical()


def transform_list():
    """Impute, flatten and count-vectorize SUBSTRING_4L_V3, then print it."""
    x_train, x_test, y_train, y_test = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)
    # The imputed column is flattened with ravel() before it is handed to
    # CountVectorizer.
    x_trainT = x_trainT.ravel()
    # NOTE: x_testT is flattened here but never vectorized in this snippet.
    x_testT = x_testT.ravel()
    count_vect = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)
    print(x_trainT.toarray())
    print('/////////////////////////', '\n')


transform_list()

对于SUBSTRING_4L_V3，我需要通过ravel()来展平它，然后再应用CountVectorizer()。

然而，我不熟悉如何在ML流程中顺序实现它们，下面是我的尝试：

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression


class RavelTransformer(BaseEstimator, TransformerMixin):
    """Custom pipeline step intended to flatten the imputed n-gram column.

    NOTE: kept exactly as posted in the question. The class defines no
    transform() method and fit() returns self.ravel() instead of self;
    this is the code that produces the TypeError quoted below.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self.ravel()


data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }
df = pd.DataFrame(data)

x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

# Numeric columns: median imputation followed by standardization.
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with '' and one-hot encode.
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# N-gram column: impute, flatten with the custom step, then count-vectorize.
transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None,
         max_features=5000))])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
        ])

ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))

错误：

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't

回答：

错误信息告诉您，您的RavelTransformer类中没有transform函数。

我的假设是您想要做类似这样的事情：

class RavelTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that flattens its input with ``ravel()``.

    It has no parameters and learns nothing from the data; it exists so that
    the flattening can sit between two other steps of a Pipeline.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X.ravel()``; ``X`` must provide a ``ravel()`` method."""
        return X.ravel()

在这里，RavelTransformer在fit步骤中不做任何处理（只返回self），而在transform步骤中按预期调用ravel()将数据展平。

学技术

如何在scikit-learn的机器学习流程中添加自定义中间预处理器来处理n-gram列？

发表回复取消回复

相关文章：

Related Posts

使用LSTM在Python中预测未来值

如何在gensim的word2vec模型中查找双词组的相似性

dask_xgboost.predict 可以工作但无法显示 – 数据必须是一维的

ML Tuning – Cross Validation in Spark

如何在React JS中使用fetch从REST API获取预测

如何分析ML.NET中多类分类预测得分数组？

发表回复 取消回复

发表回复取消回复