sklearn StackingClassifier and sample weights

How do I use sample weights with sklearn's StackingClassifier?

I have a stacking workflow similar to the following:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
import xgboost as xgb

X = np.random.random(size=(1000, 5))
y = np.random.choice([0, 1], 1000)
w = np.random.random(size=(1000,))

scaler = StandardScaler()
log_reg = LogisticRegression()

params = {
    'n_estimators': 10, 'max_depth': 3, 'learning_rate': 0.1
}

log_reg_pipe = make_pipeline(
    scaler, log_reg
)

stack_pipe = make_pipeline(
    StackingClassifier(
        estimators=[('lr', log_reg_pipe)],
        final_estimator=xgb.XGBClassifier(**params),
        passthrough=True,
        cv=2
    )
)

I want to be able to pass sample weights through to xgboost. My question is: how do I set sample weights on the final estimator?

I tried

stack_pipe.fit(X, y, sample_weights=w), which throws

ValueError: Pipeline.fit does not accept the sample_weights parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, logisticregression__sample_weight=sample_weight)`
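The message points at the stepname__parameter routing that Pipeline supports. A sketch of what that would look like here ('stackingclassifier' is the step name make_pipeline generates for the StackingClassifier step); on the scikit-learn versions this question targets, though, it then fails one level deeper, because StackingClassifier forwards sample_weight to each base estimator's fit() and the base estimator is itself a Pipeline:

# Route the weights through the outer pipeline to the StackingClassifier step.
# This gets past the error above, but StackingClassifier then calls
# pipeline.fit(X, y, sample_weight=w) on the 'lr' base pipeline, which raises
# a similar error because Pipeline again needs the step-prefixed form.
stack_pipe.fit(X, y, stackingclassifier__sample_weight=w)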

Solution

I also recently realized that the stacking estimators cannot handle sample-weighted Pipelines. I got around the issue by subclassing scikit-learn's StackingRegressor and StackingClassifier classes and overriding their fit() methods to handle pipelines better. Take a look at the following:

"""Implement StackingClassifier that can handle sample-weighted Pipelines."""

from sklearn.ensemble import StackingRegressor, StackingClassifier
from copy import deepcopy

import numpy as np
from joblib import Parallel

from sklearn.base import clone
from sklearn.base import is_classifier, is_regressor

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import check_cv

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import Bunch
from sklearn.utils.fixes import delayed
from sklearn.utils.multiclass import check_classification_targets

from sklearn.pipeline import Pipeline

# Every sample-weighted Pipeline handed to the stacker must name its final
# step like this, so that '<step name>__sample_weight' can be built as a
# fit parameter.
ESTIMATOR_NAME_IN_PIPELINE = 'estimator'

def new_fit_single_estimator(estimator, X, y, sample_weight=None,
                             message_clsname=None, message=None):
    """Private function used to fit an estimator within a job."""
    if sample_weight is not None:
        try:
            if isinstance(estimator, Pipeline):
                # determine the name of the final estimator step and route the
                # sample weights to it via '<step name>__sample_weight'
                estimator_name = estimator.steps[-1][0]
                kwargs = {estimator_name + '__sample_weight': sample_weight}
                estimator.fit(X, y, **kwargs)
            else:
                estimator.fit(X, y, sample_weight=sample_weight)
        except TypeError as exc:
            if "unexpected keyword argument 'sample_weight'" in str(exc):
                raise TypeError(
                    "Underlying estimator {} does not support sample weights."
                    .format(estimator.__class__.__name__)
                ) from exc
            raise
    else:
        estimator.fit(X, y)
    return estimator


class FlexibleStackingClassifier(StackingClassifier):

    def __init__(self, estimators, final_estimator=None, *, cv=None,
                 n_jobs=None, passthrough=False, verbose=0):
        super().__init__(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=cv,
            n_jobs=n_jobs,
            passthrough=passthrough,
            verbose=verbose
        )

    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

            .. versionchanged:: 0.23
               when not None, `sample_weight` is passed to all underlying
               estimators

        Returns
        -------
        self : object
        """
        # Mirror StackingClassifier.fit: encode the labels so that
        # _concatenate_predictions() and predict() can rely on
        # self.classes_ / self._le.
        check_classification_targets(y)
        self._le = LabelEncoder().fit(y)
        self.classes_ = self._le.classes_
        y = self._le.transform(y)

        # all_estimators contains all estimators, the one to be fitted and the
        # 'drop' string.
        names, all_estimators = self._validate_estimators()
        self._validate_final_estimator()

        stack_method = [self.stack_method] * len(all_estimators)

        # Fit the base estimators on the whole training data. Those
        # base estimators will be used in transform, predict, and
        # predict_proba. They are exposed publicly.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(new_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators if est != 'drop'
        )

        self.named_estimators_ = Bunch()
        est_fitted_idx = 0
        for name_est, org_est in zip(names, all_estimators):
            if org_est != 'drop':
                self.named_estimators_[name_est] = self.estimators_[
                    est_fitted_idx]
                est_fitted_idx += 1
            else:
                self.named_estimators_[name_est] = 'drop'

        # To train the meta-classifier using the most data as possible, we use
        # a cross-validation to obtain the output of the stacked estimators.

        # To ensure that the data provided to each estimator are the same, we
        # need to set the random state of the cv if there is one and we need to
        # take a copy.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        self.stack_method_ = [
            self._method_name(name, est, meth)
            for name, est, meth in zip(names, all_estimators, stack_method)
        ]
        fit_params = ({f"{ESTIMATOR_NAME_IN_PIPELINE}__sample_weight": sample_weight}
                      if sample_weight is not None
                      else None)
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv),
                                       method=meth, n_jobs=self.n_jobs,
                                       fit_params=fit_params,
                                       verbose=self.verbose)
            for est, meth in zip(all_estimators, self.stack_method_)
            if est != 'drop'
        )

        # Only not None or not 'drop' estimators will be used in transform.
        # Remove the None from the method as well.
        self.stack_method_ = [
            meth for (meth, est) in zip(self.stack_method_, all_estimators)
            if est != 'drop'
        ]

        X_meta = self._concatenate_predictions(X, predictions)
        new_fit_single_estimator(self.final_estimator_, X_meta, y,
                                 sample_weight=sample_weight)

        return self


class FlexibleStackingRegressor(StackingRegressor):
    """Regressor counterpart of FlexibleStackingClassifier.

    Its __init__ and fit() overrides mirror the classifier above (minus the
    label encoding, which only applies to classification); the body is
    elided here.
    """

I included both the Regressor and Classifier versions, although it looks like you only need the Classifier subclass.

One caveat, though: you have to give the estimator step in your pipelines the same name, and that name has to be consistent with the ESTIMATOR_NAME_IN_PIPELINE field defined above, otherwise the code will not work. For example, here is a properly defined Pipeline instance whose step name matches the one defined in the class-definition script shown above:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TweedieRegressor
from sklearn.feature_selection import VarianceThreshold

validly_named_pipeline = Pipeline([
    ('variance_threshold', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('estimator', TweedieRegressor())
])
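
To tie this back to the question, here is a minimal usage sketch under that naming convention (it reuses the question's X, y, w and xgboost params and assumes the FlexibleStackingClassifier defined above; the base pipeline's last step must be named 'estimator'):

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Base learner wrapped in a Pipeline whose final step is named 'estimator',
# matching ESTIMATOR_NAME_IN_PIPELINE, so the weights can be routed to it.
base_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', LogisticRegression()),
])

flex_stack = FlexibleStackingClassifier(
    estimators=[('lr', base_pipe)],
    final_estimator=xgb.XGBClassifier(n_estimators=10, max_depth=3, learning_rate=0.1),
    passthrough=True,
    cv=2,
)

# The weights reach the base pipeline as 'estimator__sample_weight' and the
# xgboost final estimator through its own fit(..., sample_weight=...).
flex_stack.fit(X, y, sample_weight=w)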

This isn't ideal, but it's what I have for now, and it should work either way.

Edit: to be clear, when I overrode the fit() method I simply copied and pasted the code from the scikit-learn repository and made the necessary changes, which amount to only a few lines. So most of the pasted code is not my own original work but that of the scikit-learn developers.

Another answer

In your case, since you have a nested pipeline, the following are the keys you would have to use when passing parameters.

list(stack_pipe.get_params().keys())

['memory','steps','verbose','stackingclassifier','stackingclassifier__cv','stackingclassifier__estimators','stackingclassifier__final_estimator__objective','stackingclassifier__final_estimator__use_label_encoder','stackingclassifier__final_estimator__base_score','stackingclassifier__final_estimator__booster','stackingclassifier__final_estimator__colsample_bylevel','stackingclassifier__final_estimator__colsample_bynode','stackingclassifier__final_estimator__colsample_bytree','stackingclassifier__final_estimator__gamma','stackingclassifier__final_estimator__gpu_id','stackingclassifier__final_estimator__importance_type','stackingclassifier__final_estimator__interaction_constraints','stackingclassifier__final_estimator__learning_rate','stackingclassifier__final_estimator__max_delta_step','stackingclassifier__final_estimator__max_depth','stackingclassifier__final_estimator__min_child_weight','stackingclassifier__final_estimator__missing','stackingclassifier__final_estimator__monotone_constraints','stackingclassifier__final_estimator__n_estimators','stackingclassifier__final_estimator__n_jobs','stackingclassifier__final_estimator__num_parallel_tree','stackingclassifier__final_estimator__random_state','stackingclassifier__final_estimator__reg_alpha','stackingclassifier__final_estimator__reg_lambda','stackingclassifier__final_estimator__scale_pos_weight','stackingclassifier__final_estimator__subsample','stackingclassifier__final_estimator__tree_method','stackingclassifier__final_estimator__validate_parameters','stackingclassifier__final_estimator__verbosity','stackingclassifier__final_estimator','stackingclassifier__n_jobs','stackingclassifier__passthrough','stackingclassifier__stack_method','stackingclassifier__verbose','stackingclassifier__lr','stackingclassifier__lr__memory','stackingclassifier__lr__steps','stackingclassifier__lr__verbose','stackingclassifier__lr__standardscaler','stackingclassifier__lr__logisticregression','stackingclassifier__lr__standardscaler__copy','stackingclassifier__lr__standardscaler__with_mean','stackingclassifier__lr__standardscaler__with_std','stackingclassifier__lr__logisticregression__C','stackingclassifier__lr__logisticregression__class_weight','stackingclassifier__lr__logisticregression__dual','stackingclassifier__lr__logisticregression__fit_intercept','stackingclassifier__lr__logisticregression__intercept_scaling','stackingclassifier__lr__logisticregression__l1_ratio','stackingclassifier__lr__logisticregression__max_iter','stackingclassifier__lr__logisticregression__multi_class','stackingclassifier__lr__logisticregression__n_jobs','stackingclassifier__lr__logisticregression__penalty','stackingclassifier__lr__logisticregression__random_state','stackingclassifier__lr__logisticregression__solver','stackingclassifier__lr__logisticregression__tol','stackingclassifier__lr__logisticregression__verbose','stackingclassifier__lr__logisticregression__warm_start']

If you look closely, there is no sample_weight key for the final_estimator. You may want to check the original API to see whether it has been deprecated or renamed.
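
Worth noting: get_params() only exposes constructor hyperparameters, while sample_weight is a fit-time argument, so it will never appear in that list even when it is supported. A quick way to check is to inspect the fit signature directly (a small sketch, assuming the xgboost sklearn wrapper from the question):

import inspect
import xgboost as xgb

# sample_weight is accepted by fit(), not by __init__, which is why it does
# not show up in get_params() above.
print(inspect.signature(xgb.XGBClassifier.fit))
# The printed signature should include a sample_weight parameter.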
