slope 0.31.0
Loading...
Searching...
No Matches
cv.h
Go to the documentation of this file.
1
11#pragma once
12
13#include "folds.h"
14#include "score.h"
15#include "slope.h"
16#include <vector>
17
18#ifdef _OPENMP
19#include <omp.h>
20#endif
21
22namespace slope {
23
31{
35 Eigen::MatrixXd score;
36
38 std::map<std::string, double> params;
39
41 Eigen::ArrayXd alphas;
42
44 Eigen::ArrayXd mean_scores;
45
48 Eigen::ArrayXd std_errors;
49};
50
59{
62 std::vector<GridResult> results;
63
66 std::map<std::string, double> best_params;
67
69 double best_score;
70
73
77};
78
/**
 * @brief Configuration settings for cross-validation.
 */
struct CvConfig
{
  /// Number of folds for cross-validation (default: 10)
  int n_folds = 10;

  /// Number of times to repeat the cross-validation (default: 1)
  int n_repeats = 1;

  /// Evaluation metric used for model assessment (default: "mse")
  std::string metric = "mse";

  /// Seed for random number generator to ensure reproducibility (default: 42)
  uint64_t random_seed = 42;

  /// Map of hyperparameter names to vectors of values to evaluate. Entries
  /// here replace the same-named entries of default_hyperparams.
  std::map<std::string, std::vector<double>> hyperparams;

  /// Hyperparameter values used for every name that hyperparams does not
  /// override (default: q = 0.1, gamma = 0.0).
  std::map<std::string, std::vector<double>> default_hyperparams = {
    { "q", { 0.1 } },
    { "gamma", { 0.0 } },
  };

  /// Optional user-defined fold assignments for custom cross-validation
  /// splits. When set, crossValidate() builds its folds from this value
  /// instead of from n_folds, n_repeats and random_seed.
  std::optional<std::vector<std::vector<std::vector<int>>>> predefined_folds;
};
112
/**
 * @brief Creates a grid of parameter combinations from parameter value
 * ranges.
 *
 * @param param_values Map of parameter names to the values to combine.
 * @return One name-to-value map per parameter combination.
 */
std::vector<std::map<std::string, double>>
createGrid(const std::map<std::string, std::vector<double>>& param_values);
126
137void
138findBestParameters(CvResult& cv_result, const std::unique_ptr<Score>& scorer);
139
164template<typename MatrixType>
167 MatrixType& x,
168 const Eigen::MatrixXd& y_in,
169 const CvConfig& config = CvConfig())
170{
171 CvResult cv_result;
172
173 int n = y_in.rows();
174
175 auto loss = setupLoss(model.getLossType());
176
177 auto y = loss->preprocessResponse(y_in);
178 auto scorer = Score::create(config.metric);
179
180 auto hyperparams = config.default_hyperparams;
181
182 // Override with user-specified parameters
183 for (const auto& [key, values] : config.hyperparams) {
184 hyperparams[key] = values;
185 }
186
187 auto grid = createGrid(hyperparams);
188
189 // Total number of evaluations (n_repeats * n_folds)
190 Folds folds =
191 config.predefined_folds.has_value()
192 ? Folds(config.predefined_folds.value())
193 : Folds(n, config.n_folds, config.n_repeats, config.random_seed);
194
195 int n_evals = folds.numEvals();
196
197 for (const auto& params : grid) {
198 GridResult result;
199 result.params = params;
200
201 double q = params.at("q");
202 double gamma = params.at("gamma");
203
204 model.setQ(q);
205
206 auto initial_path = model.path(x, y);
207
208 result.alphas = initial_path.getAlpha();
209 int n_alpha = result.alphas.size();
210
211 assert((result.alphas > 0).all());
212
213 Eigen::MatrixXd scores = Eigen::MatrixXd::Zero(n_evals, n_alpha);
214
215 Eigen::setNbThreads(1);
216
217 // Thread-safety for exceptions
218 std::vector<std::string> thread_errors(n_evals);
219 bool had_exception = false;
220
221#ifdef _OPENMP
222 omp_set_max_active_levels(1);
223#pragma omp parallel for num_threads(Threads::get()) \
224 shared(scores, thread_errors, had_exception)
225#endif
226 for (int i = 0; i < n_evals; ++i) {
227 try {
228 auto [rep, fold] = std::div(i, folds.numFolds());
229
230 Slope thread_model = model;
231 thread_model.setModifyX(true);
232
233 // TODO: Maybe consider not copying at all?
234 auto [x_train, y_train, x_test, y_test] = folds.split(x, y, fold, rep);
235
236 auto path = thread_model.path(x_train, y_train, result.alphas);
237
238 if (gamma > 0) {
239 path = thread_model.relax(path, x_train, y_train, gamma);
240 }
241
242 for (int j = 0; j < n_alpha; ++j) {
243 auto eta = path(j).predict(x_test, "linear");
244 scores(i, j) = scorer->eval(eta, y_test, loss);
245 }
246 } catch (const std::exception& e) {
247 thread_errors[i] = e.what();
248#ifdef _OPENMP
249#pragma omp atomic write
250#endif
251 had_exception = true;
252 } catch (...) {
253 thread_errors[i] = "Unknown exception";
254#ifdef _OPENMP
255#pragma omp atomic write
256#endif
257 had_exception = true;
258 }
259 }
260
261 if (had_exception) {
262 std::string error_message = "Exception(s) during cross-validation:\n";
263 for (int i = 0; i < n_evals; ++i) {
264 if (!thread_errors[i].empty()) {
265 error_message +=
266 "Fold " + std::to_string(i) + ": " + thread_errors[i] + "\n";
267 }
268 }
269 throw std::runtime_error(error_message);
270 }
271
272 result.mean_scores = scores.colwise().mean();
273 result.std_errors = stdDevs(scores).array() / std::sqrt(n_evals);
274 result.score = std::move(scores);
275 cv_result.results.push_back(result);
276 }
277
278#ifdef _OPENMP
279 Eigen::setNbThreads(0);
280#endif
281
282 findBestParameters(cv_result, scorer);
283
284 return cv_result;
285}
286
287} // namespace slope
Manages data partitioning for cross-validation.
Definition folds.h:26
size_t numEvals() const
Get the total number of folds (repetitions * folds)
Definition folds.h:160
size_t numFolds() const
Get the number of folds.
Definition folds.h:146
std::tuple< MatrixType, Eigen::MatrixXd, MatrixType, Eigen::MatrixXd > split(MatrixType &x, const Eigen::MatrixXd &y, size_t fold_idx, size_t rep_idx=0) const
Split data into training and test sets for a specific fold and repetition.
Definition folds.h:123
static std::unique_ptr< Score > create(const std::string &metric)
Definition score.cpp:182
Eigen::MatrixXd predict(T &x, const std::string &type="response") const
Predict the response for a given input matrix.
Definition slope_fit.h:237
The SLOPE model.
Definition slope.h:33
SlopePath path(T &x, const Eigen::MatrixXd &y_in, Eigen::ArrayXd alpha=Eigen::ArrayXd::Zero(0), Eigen::ArrayXd lambda=Eigen::ArrayXd::Zero(0))
Computes SLOPE regression solution path for multiple alpha and lambda values.
Definition slope.cpp:27
const std::string & getLossType()
Get currently defined loss type.
Definition slope.cpp:614
SlopeFit relax(const SlopeFit &fit, T &x, const Eigen::VectorXd &y_in, const double gamma=0.0, Eigen::VectorXd beta0=Eigen::VectorXd(0), Eigen::VectorXd beta=Eigen::VectorXd(0))
Relaxes a fitted SLOPE model.
Definition slope.h:373
void setModifyX(const bool modify_x)
Controls if x should be modified-in-place.
Definition slope.cpp:554
void setQ(double q)
Sets the q value.
Definition slope.cpp:441
Cross-validation fold management for SLOPE models.
Namespace containing SLOPE regression implementation.
Definition clusters.cpp:5
std::unique_ptr< Loss > setupLoss(const std::string &loss)
Factory function to create the appropriate loss function based on the distribution family.
void findBestParameters(CvResult &cv_result, const std::unique_ptr< Score > &scorer)
Identifies the best parameters from cross-validation results.
Definition cv.cpp:39
Eigen::VectorXd stdDevs(const Eigen::SparseMatrix< double > &x)
Computes the standard deviation for each column of a matrix.
Definition math.cpp:180
std::vector< std::map< std::string, double > > createGrid(const std::map< std::string, std::vector< double > > &param_values)
Creates a grid of parameter combinations from parameter value ranges.
Definition cv.cpp:6
CvResult crossValidate(Slope model, MatrixType &x, const Eigen::MatrixXd &y_in, const CvConfig &config=CvConfig())
Performs cross-validation on a SLOPE model to select optimal hyperparameters.
Definition cv.h:166
Scoring metrics for model evaluation.
SLOPE (Sorted L-One Penalized Estimation) optimization.
Configuration settings for cross-validation.
Definition cv.h:87
int n_repeats
Number of times to repeat the cross-validation (default: 1)
Definition cv.h:92
std::map< std::string, std::vector< double > > hyperparams
Map of hyperparameter names to vectors of values to evaluate.
Definition cv.h:101
std::optional< std::vector< std::vector< std::vector< int > > > > predefined_folds
Optional user-defined fold assignments for custom cross-validation splits.
Definition cv.h:110
int n_folds
Number of folds for cross-validation (default: 10)
Definition cv.h:89
std::string metric
Evaluation metric used for model assessment (default: "mse")
Definition cv.h:95
std::map< std::string, std::vector< double > > default_hyperparams
Default hyperparameter values (q = 0.1, gamma = 0.0), used for any name that hyperparams does not override.
Definition cv.h:104
uint64_t random_seed
Seed for random number generator to ensure reproducibility (default: 42)
Definition cv.h:98
Contains overall results from a cross-validation process.
Definition cv.h:59
double best_score
The score achieved by the optimal hyperparameter configuration.
Definition cv.h:69
std::map< std::string, double > best_params
Definition cv.h:66
std::vector< GridResult > results
Definition cv.h:62
int best_ind
Index of the best performing configuration in the results vector.
Definition cv.h:72
int best_alpha_ind
Definition cv.h:76
Stores cross-validation results for a specific set of hyperparameters.
Definition cv.h:31
Eigen::MatrixXd score
Definition cv.h:35
Eigen::ArrayXd mean_scores
Array of scores averaged across all folds for each alpha value.
Definition cv.h:44
std::map< std::string, double > params
Map of hyperparameter names to their values for the configuration.
Definition cv.h:38
Eigen::ArrayXd std_errors
Definition cv.h:48
Eigen::ArrayXd alphas
Array of regularization parameters used in the regularization path.
Definition cv.h:41