slope 6.2.1
Loading...
Searching...
No Matches
cv.h
Go to the documentation of this file.
1
11#pragma once
12
13#include "eigen_compat.h"
14#include "folds.h"
15#include "score.h"
16#include "slope.h"
17#include <vector>
18
19#ifdef _OPENMP
20#include <omp.h>
21#endif
22
23namespace slope {
24
25using slope::all;
26
34{
38 Eigen::MatrixXd score;
39
41 std::map<std::string, double> params;
42
44 Eigen::ArrayXd alphas;
45
47 Eigen::ArrayXd mean_scores;
48
51 Eigen::ArrayXd std_errors;
52};
53
62{
65 std::vector<GridResult> results;
66
69 std::map<std::string, double> best_params;
70
72 double best_score;
73
76
80};
81
90{
92 int n_folds = 10;
93
95 int n_repeats = 1;
96
98 std::string metric = "mse";
99
101 uint64_t random_seed = 42;
102
104 bool copy_x = true;
105
107 std::map<std::string, std::vector<double>> hyperparams;
108
110 std::map<std::string, std::vector<double>> default_hyperparams = {
111 { "q", { 0.1 } },
112 { "gamma", { 0.0 } },
113 };
114
116 std::optional<std::vector<std::vector<std::vector<int>>>> predefined_folds;
117};
118
119namespace detail {
120
121std::vector<std::map<std::string, double>>
122createGrid(const std::map<std::string, std::vector<double>>& param_values);
123
124void
125findBestParameters(CvResult& cv_result, const std::unique_ptr<Score>& scorer);
126
127template<typename T>
128Eigen::ArrayXd
129fitToFold(Eigen::MatrixBase<T>& x,
130 const Eigen::MatrixXd& y,
131 const Folds& folds,
132 const std::unique_ptr<Loss>& loss,
133 const std::unique_ptr<Score>& scorer,
134 const Eigen::ArrayXd& alphas,
135 Slope& thread_model,
136 const int fold,
137 const int rep,
138 const double gamma = 0.0,
139 const bool copy_x = true)
140{
141 Eigen::ArrayXd scores = Eigen::ArrayXd::Zero(alphas.size());
142
143 if (copy_x) {
144 thread_model.setModifyX(true);
145
146 auto [x_train, y_train, x_test, y_test] = folds.split(x, y, fold, rep);
147
148 auto path = thread_model.path(x_train, y_train, alphas);
149
150 if (gamma > 0) {
151 path = thread_model.relax(path, x_train, y_train, gamma);
152 }
153
154 for (int j = 0; j < path.size(); ++j) {
155 auto eta = path(j).predict(x_test, "linear");
156 scores(j) = scorer->eval(eta, y_test, loss);
157 }
158
159 } else {
160 thread_model.setModifyX(false);
161
162 auto train_idx = folds.getTrainingIndices(fold, rep);
163 auto test_idx = folds.getTestIndices(fold, rep);
164
165 // Create views
166 auto x_train = x(train_idx, all);
167 auto x_test = x(test_idx, all);
168
169 Eigen::MatrixXd y_train = y(train_idx, all);
170 Eigen::MatrixXd y_test = y(test_idx, all);
171
172 auto path = thread_model.path(x_train, y_train, alphas);
173
174 if (gamma > 0) {
175 path = thread_model.relax(path, x_train, y_train, gamma);
176 }
177
178 for (int j = 0; j < path.size(); ++j) {
179 auto eta = path(j).predict(x_test, "linear");
180 scores(j) = scorer->eval(eta, y_test, loss);
181 }
182 }
183
184 return scores;
185}
186
187template<typename T>
188Eigen::ArrayXd
189fitToFold(Eigen::SparseMatrixBase<T>& x,
190 const Eigen::MatrixXd& y,
191 const Folds& folds,
192 const std::unique_ptr<Loss>& loss,
193 const std::unique_ptr<Score>& scorer,
194 const Eigen::ArrayXd& alphas,
195 Slope& thread_model,
196 const int fold,
197 const int rep,
198 const double gamma = 0.0,
199 const bool copy_x = true)
200{
201 thread_model.setModifyX(true);
202
203 auto [x_train, y_train, x_test, y_test] = folds.split(x, y, fold, rep);
204
205 auto path = thread_model.path(x_train, y_train, alphas);
206
207 if (gamma > 0) {
208 path = thread_model.relax(path, x_train, y_train, gamma);
209 }
210
211 Eigen::ArrayXd scores = Eigen::ArrayXd::Zero(path.size());
212
213 for (int j = 0; j < path.size(); ++j) {
214 auto eta = path(j).predict(x_test, "linear");
215 scores(j) = scorer->eval(eta, y_test, loss);
216 }
217
218 return scores;
219}
220
221} // namespace detail
222
246template<typename T>
247CvResult
249 Eigen::EigenBase<T>& x,
250 const Eigen::MatrixXd& y_in,
251 const CvConfig& config = CvConfig())
252{
253 CvResult cv_result;
254
255 int n = y_in.rows();
256
257 auto loss = setupLoss(model.getLossType());
258
259 auto y = loss->preprocessResponse(y_in);
260 auto scorer = Score::create(config.metric);
261
262 auto hyperparams = config.default_hyperparams;
263
264 // Override with user-specified parameters
265 for (const auto& [key, values] : config.hyperparams) {
266 hyperparams[key] = values;
267 }
268
269 auto grid = detail::createGrid(hyperparams);
270
271 // Total number of evaluations (n_repeats * n_folds)
272 Folds folds =
273 config.predefined_folds.has_value()
274 ? Folds(*config.predefined_folds)
275 : Folds(n, config.n_folds, config.n_repeats, config.random_seed);
276
277 int n_evals = folds.numEvals();
278
279 for (const auto& params : grid) {
280 GridResult result;
281 result.params = params;
282
283 double q = params.at("q");
284 double gamma = params.at("gamma");
285
286 model.setQ(q);
287
288 auto initial_path = model.path(x, y);
289
290 result.alphas = initial_path.getAlpha();
291 int n_alpha = result.alphas.size();
292
293 assert((result.alphas > 0).all());
294
295 Eigen::MatrixXd scores = Eigen::MatrixXd::Zero(n_evals, n_alpha);
296
297#ifdef _OPENMP
298 Eigen::setNbThreads(1);
299#endif
300
301 // Thread-safety for exceptions
302 std::vector<std::string> thread_errors(n_evals);
303 bool had_exception = false;
304
305#ifdef _OPENMP
306 omp_set_max_active_levels(1);
307#pragma omp parallel for num_threads(Threads::get()) \
308 shared(scores, thread_errors, had_exception)
309#endif
310 for (int i = 0; i < n_evals; ++i) {
311 try {
312 auto [rep, fold] = std::div(i, folds.numFolds());
313
314 Slope thread_model = model;
315
316 scores.row(i) = detail::fitToFold(x.derived(),
317 y,
318 folds,
319 loss,
320 scorer,
321 result.alphas,
322 thread_model,
323 fold,
324 rep,
325 gamma,
326 config.copy_x);
327
328 } catch (const std::exception& e) {
329 thread_errors[i] = e.what();
330#ifdef _OPENMP
331#pragma omp atomic write
332#endif
333 had_exception = true;
334 } catch (...) {
335 thread_errors[i] = "Unknown exception";
336#ifdef _OPENMP
337#pragma omp atomic write
338#endif
339 had_exception = true;
340 }
341 }
342
343 if (had_exception) {
344 std::string error_message = "Exception(s) during cross-validation:\n";
345 for (int i = 0; i < n_evals; ++i) {
346 if (!thread_errors[i].empty()) {
347 error_message +=
348 "Fold " + std::to_string(i) + ": " + thread_errors[i] + "\n";
349 }
350 }
351 throw std::runtime_error(error_message);
352 }
353
354 result.mean_scores = scores.colwise().mean();
355 result.std_errors = stdDevs(scores).array() / std::sqrt(n_evals);
356 result.score = std::move(scores);
357 cv_result.results.push_back(result);
358 }
359
360#ifdef _OPENMP
361 Eigen::setNbThreads(0);
362#endif
363
364 detail::findBestParameters(cv_result, scorer);
365
366 return cv_result;
367}
368
369} // namespace slope
Manages data partitioning for cross-validation.
Definition folds.h:29
size_t numEvals() const
Get the total number of folds (repetitions * folds)
Definition folds.h:166
size_t numFolds() const
Get the number of folds.
Definition folds.h:152
std::vector< int > getTrainingIndices(size_t fold_idx, size_t rep_idx=0) const
Get training indices for a specific fold and repetition.
const std::vector< int > & getTestIndices(size_t fold_idx, size_t rep_idx=0) const
Get test indices for a specific fold and repetition.
auto split(Eigen::EigenBase< T > &x, const Eigen::MatrixXd &y, size_t fold_idx, size_t rep_idx=0) const
Split data into training and test sets for a specific fold and repetition.
Definition folds.h:130
static std::unique_ptr< Score > create(const std::string &metric)
Eigen::MatrixXd predict(Eigen::EigenBase< T > &x, const std::string &type="response") const
Predict the response for a given input matrix.
Definition slope_fit.h:237
The SLOPE model.
Definition slope.h:56
SlopePath path(Eigen::EigenBase< T > &x, const Eigen::MatrixXd &y_in, Eigen::ArrayXd alpha=Eigen::ArrayXd::Zero(0), Eigen::ArrayXd lambda=Eigen::ArrayXd::Zero(0), std::function< bool()> check_interrupt=defaultInterruptChecker)
Computes SLOPE regression solution path for multiple alpha and lambda values.
Definition slope.h:388
const std::string & getLossType()
Get currently defined loss type.
SlopeFit relax(const SlopeFit &fit, T &x, const Eigen::VectorXd &y_in, const double gamma=0.0, Eigen::VectorXd beta0=Eigen::VectorXd(0), Eigen::VectorXd beta=Eigen::VectorXd(0))
Relaxes a fitted SLOPE model.
Definition slope.h:862
void setModifyX(const bool modify_x)
Controls if x should be modified-in-place.
void setQ(double q)
Sets the q value.
Eigen compatibility layer for version differences.
Cross-validation fold management for SLOPE models.
Namespace containing SLOPE regression implementation.
Definition clusters.h:11
std::unique_ptr< Loss > setupLoss(const std::string &loss)
Factory function to create the appropriate loss function based on the distribution family.
CvResult crossValidate(Slope model, Eigen::EigenBase< T > &x, const Eigen::MatrixXd &y_in, const CvConfig &config=CvConfig())
Performs cross-validation on a SLOPE model to select optimal hyperparameters.
Definition cv.h:248
Eigen::VectorXd stdDevs(const Eigen::SparseMatrixBase< T > &x)
Computes the standard deviation for each column of a matrix.
Definition math.h:618
Scoring metrics for model evaluation.
SLOPE (Sorted L-One Penalized Estimation) optimization.
Configuration settings for cross-validation.
Definition cv.h:90
int n_repeats
Number of times to repeat the cross-validation (default: 1)
Definition cv.h:95
bool copy_x
Whether to copy the design matrix for each fold (default: true)
Definition cv.h:104
std::map< std::string, std::vector< double > > hyperparams
Map of hyperparameter names to vectors of values to evaluate.
Definition cv.h:107
std::optional< std::vector< std::vector< std::vector< int > > > > predefined_folds
Optional user-defined fold assignments for custom cross-validation splits.
Definition cv.h:116
int n_folds
Number of folds for cross-validation (default: 10)
Definition cv.h:92
std::string metric
Evaluation metric used for model assessment (default: "mse")
Definition cv.h:98
std::map< std::string, std::vector< double > > default_hyperparams
Map of hyperparameter names to vectors of values to evaluate.
Definition cv.h:110
uint64_t random_seed
Seed for random number generator to ensure reproducibility (default: 42)
Definition cv.h:101
Contains overall results from a cross-validation process.
Definition cv.h:62
double best_score
The score achieved by the optimal hyperparameter configuration.
Definition cv.h:72
std::map< std::string, double > best_params
Definition cv.h:69
std::vector< GridResult > results
Definition cv.h:65
int best_ind
Index of the best performing configuration in the results vector.
Definition cv.h:75
int best_alpha_ind
Definition cv.h:79
Stores cross-validation results for a specific set of hyperparameters.
Definition cv.h:34
Eigen::MatrixXd score
Definition cv.h:38
Eigen::ArrayXd mean_scores
Array of scores averaged across all folds for each alpha value.
Definition cv.h:47
std::map< std::string, double > params
Map of hyperparameter names to their values for the configuration.
Definition cv.h:41
Eigen::ArrayXd std_errors
Definition cv.h:51
Eigen::ArrayXd alphas
Array of regularization parameters used in the regularization path.
Definition cv.h:44