cpp-toolbox  0.0.1
A toolbox library for C++
Loading...
Searching...
No Matches
statistics.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <algorithm>
4#include <cmath>
5#include <functional>
6#include <iterator>
7#include <map>
8#include <numeric>
9#include <stdexcept>
10#include <string>
11#include <type_traits>
12#include <utility>
13#include <vector>
14
16
17namespace toolbox::math
18{
19
/**
 * @brief Reject empty containers with a descriptive exception.
 *
 * @tparam TContainer Any type that exposes an `empty()` member.
 * @param data Container to inspect.
 * @param func_name Name of the calling routine; embedded in the error text.
 * @throws std::invalid_argument if `data.empty()` is true.
 */
template<typename TContainer>
void check_empty(const TContainer& data, const char* func_name)
{
  // Guard clause: the common, non-empty case leaves immediately.
  if (!data.empty()) {
    return;
  }
  throw std::invalid_argument(std::string("Input container for '") + func_name
                              + "' cannot be empty.");
}
46
/**
 * @brief Arithmetic mean of the elements of a container.
 *
 * @tparam TContainer Container whose `value_type` is an arithmetic type.
 * @param data Input values; must not be empty.
 * @return Sum of the elements divided by the element count, as a double.
 * @throws std::invalid_argument if `data` is empty (raised by check_empty).
 */
template<typename TContainer>
double mean(const TContainer& data)
{
  static_assert(std::is_arithmetic_v<typename TContainer::value_type>,
                "Mean requires arithmetic type elements.");
  check_empty(data, "mean");

  // Fold into a double accumulator so each element is widened before adding.
  const double total = std::accumulate(
      data.begin(),
      data.end(),
      0.0,
      [](double running, const auto& item)
      { return running + static_cast<double>(item); });
  return total / data.size();
}
78
94template<typename TContainer>
95double median(const TContainer& data)
96{
97 using ValueType = typename TContainer::value_type;
98 static_assert(std::is_arithmetic_v<ValueType>,
99 "Median requires arithmetic type elements for calculation.");
100 static_assert(toolbox::traits::is_less_than_comparable_v<ValueType>,
101 "Median requires comparable elements for sorting.");
102 check_empty(data, "median");
103
104 std::vector<ValueType> sorted_data(
105 data.begin(),
106 data.end()); // 复制数据以便排序/Copy data for sorting
107 std::sort(sorted_data.begin(), sorted_data.end());
108
109 size_t n = sorted_data.size();
110 if (n % 2 != 0) {
111 // 奇数个元素/Odd number of elements
112 return static_cast<double>(sorted_data[n / 2]);
113 } else {
114 // 偶数个元素/Even number of elements
115 return static_cast<double>(sorted_data[n / 2 - 1] + sorted_data[n / 2])
116 / 2.0;
117 }
118}
119
137template<typename TContainer>
138std::vector<typename TContainer::value_type> mode(const TContainer& data)
139{
140 using ValueType = typename TContainer::value_type;
141 static_assert(
142 toolbox::traits::is_less_than_comparable_v<ValueType>,
143 "Mode requires element type to be comparable (for std::map keys).");
144 // 允许空容器的众数为空集合/Allow mode of empty container to be empty set
145 if (data.empty()) {
146 return {};
147 }
148
149 std::map<ValueType, size_t> counts;
150 for (const auto& val : data) {
151 counts[val]++;
152 }
153
154 size_t max_freq = 0;
155 for (const auto& pair : counts) {
156 if (pair.second > max_freq) {
157 max_freq = pair.second;
158 }
159 }
160
161 std::vector<ValueType> modes;
162 if (max_freq > 0) { // 确保至少有一个元素/Ensure at least one element
163 for (const auto& pair : counts) {
164 if (pair.second == max_freq) {
165 modes.push_back(pair.first);
166 }
167 }
168 // 为了结果的一致性,对众数进行排序 (如果它们本身是可比较的)/Sort modes for
169 // consistency (if comparable) 如果 ValueType 不支持排序,但作为 map key
170 // 必须支持,这里 sort 应该也没问题/If ValueType is not sortable, but as map
171 // key it must be, so sort is fine here
172 std::sort(modes.begin(), modes.end());
173 }
174 return modes;
175}
176
/**
 * @brief Variance of the elements of a container (two-pass computation).
 *
 * The mean is computed first, then the squared deviations from it are summed.
 *
 * @tparam TContainer Container whose `value_type` is an arithmetic type.
 * @param data Input values.
 * @param sample_variance If true (default) divide by N-1 (sample variance);
 *        if false divide by N (population variance).
 * @return The variance as a double.
 * @throws std::invalid_argument if sample variance is requested with fewer
 *         than 2 elements, or population variance is requested for an empty
 *         container.
 */
template<typename TContainer>
double variance(const TContainer& data, bool sample_variance = true)
{
  using ValueType = typename TContainer::value_type;
  static_assert(std::is_arithmetic_v<ValueType>,
                "Variance requires arithmetic type elements.");

  const size_t n = data.size();
  const char* func_name_str =
      sample_variance ? "sample variance" : "population variance";

  if (sample_variance) {
    // The N-1 divisor would be zero (or underflow) for fewer than 2 points.
    if (n < 2) {
      std::string err_msg =
          "Sample variance requires at least 2 data points. Container for '";
      err_msg += func_name_str;
      err_msg += "' is too small.";
      throw std::invalid_argument(err_msg);
    }
  } else {
    // Population variance needs at least one element.
    check_empty(data, func_name_str);
  }

  const double m = mean(data);
  double sum_sq_diff = 0.0;
  for (const auto& val : data) {
    // Square by multiplication rather than a general-purpose std::pow call.
    const double diff = static_cast<double>(val) - m;
    sum_sq_diff += diff * diff;
  }

  const size_t divisor = sample_variance ? n - 1 : n;
  return sum_sq_diff / static_cast<double>(divisor);
}
236
/**
 * @brief Standard deviation of the elements of a container.
 *
 * Defined as the square root of variance(); all size validation is delegated
 * to that function.
 *
 * @tparam TContainer Container whose `value_type` is an arithmetic type.
 * @param data Input values.
 * @param sample_stdev Forwarded to variance() as its `sample_variance` flag
 *        (true by default).
 * @return Square root of the variance.
 */
template<typename TContainer>
double stdev(const TContainer& data, bool sample_stdev = true)
{
  static_assert(std::is_arithmetic_v<typename TContainer::value_type>,
                "Standard deviation requires arithmetic type elements.");
  const double spread_squared = variance(data, sample_stdev);
  return std::sqrt(spread_squared);
}
270
/**
 * @brief Sum of the elements of a container, in the element type.
 *
 * The running total is kept in `value_type`, so narrow integer types may
 * overflow on large inputs; sum_d() accumulates in double instead.
 *
 * @tparam TContainer Container whose `value_type` is an arithmetic type.
 * @param data Input values; may be empty.
 * @return The total, or zero for an empty container.
 */
template<typename TContainer>
typename TContainer::value_type sum(const TContainer& data)
{
  using ValueType = typename TContainer::value_type;
  static_assert(std::is_arithmetic_v<ValueType>,
                "Sum requires arithmetic type elements.");

  // Starting from zero makes the empty case fall out naturally.
  ValueType total = static_cast<ValueType>(0);
  for (const auto& item : data) {
    // Convert back to ValueType after every step, as a ValueType accumulator
    // does.
    total = static_cast<ValueType>(total + item);
  }
  return total;
}
300
/**
 * @brief Sum of the elements of a container, accumulated as double.
 *
 * Each element is converted to double before being added to the running
 * total.
 *
 * @tparam TContainer Container whose `value_type` is an arithmetic type.
 * @param data Input values; may be empty.
 * @return The total as a double, or 0.0 for an empty container.
 */
template<typename TContainer>
double sum_d(const TContainer& data)
{
  using ValueType = typename TContainer::value_type;
  static_assert(std::is_arithmetic_v<ValueType>,
                "Sum (double) requires arithmetic type elements.");

  // A fold seeded with 0.0 returns 0.0 for an empty range.
  return std::accumulate(data.begin(),
                         data.end(),
                         0.0,
                         [](double running, const ValueType& item)
                         { return running + static_cast<double>(item); });
}
332
348template<typename TContainer>
349typename TContainer::value_type min(const TContainer& data)
350{
351 using ValueType = typename TContainer::value_type;
352 static_assert(toolbox::traits::is_less_than_comparable_v<ValueType>,
353 "min requires comparable elements.");
354 check_empty(data, "min");
355 return *std::min_element(data.begin(), data.end());
356}
357
373template<typename TContainer>
374typename TContainer::value_type max(const TContainer& data)
375{
376 using ValueType = typename TContainer::value_type;
377 // std::max_element 默认使用 operator</std::max_element uses operator< by
378 // default
379 static_assert(toolbox::traits::is_less_than_comparable_v<ValueType>,
380 "max requires comparable elements.");
381 check_empty(data, "max");
382 return *std::max_element(data.begin(), data.end());
383}
384
401template<typename TContainer>
402typename TContainer::value_type range(const TContainer& data)
403{
404 using ValueType = typename TContainer::value_type;
405 static_assert(std::is_arithmetic_v<ValueType>,
406 "Range requires arithmetic type elements for subtraction.");
407 static_assert(toolbox::traits::is_less_than_comparable_v<ValueType>,
408 "Range requires comparable elements to find min/max.");
409 check_empty(data, "range"); // min 和 max 内部也会检查/min and max will also
410 // check internally
411
412 // C++17 中 std::minmax_element 已经可用/std::minmax_element is available in
413 // C++17
414 auto [min_it, max_it] = std::minmax_element(data.begin(), data.end());
415 return static_cast<ValueType>(*max_it - *min_it);
416}
417
437template<typename TContainer>
438double percentile(const TContainer& data, double p)
439{
440 using ValueType = typename TContainer::value_type;
441 static_assert(
442 std::is_arithmetic_v<ValueType>,
443 "Percentile requires arithmetic type elements for calculation.");
444 static_assert(toolbox::traits::is_less_than_comparable_v<ValueType>,
445 "Percentile requires comparable elements for sorting.");
446 check_empty(data, "percentile");
447
448 if (p < 0.0 || p > 1.0) {
449 throw std::out_of_range(
450 "Percentile p must be between 0.0 and 1.0 inclusive.");
451 }
452
453 std::vector<ValueType> sorted_data(data.begin(), data.end());
454 std::sort(sorted_data.begin(), sorted_data.end());
455
456 // Avoid direct floating point equality comparison which triggers
457 // -Wfloat-equal when building with strict warnings. Treat values very
458 // close to the bounds as the bounds themselves.
459 if (p <= 0.0)
460 return static_cast<double>(sorted_data.front());
461 if (p >= 1.0)
462 return static_cast<double>(sorted_data.back());
463
464 // 线性插值法 (R-7 in R, type 7 in NumPy, "Excel.Exclusive" in Excel)/Linear
465 // interpolation (R-7 in R, type 7 in NumPy, "Excel.Exclusive" in Excel)
466 // (N-1)p gives 0-based index./(N-1)p gives 0-based index.
467 double index_double = p * (static_cast<double>(sorted_data.size()) - 1.0);
468 size_t lower_index = static_cast<size_t>(std::floor(index_double));
469 size_t upper_index = static_cast<size_t>(std::ceil(index_double));
470
471 if (lower_index == upper_index) { // 索引是整数/Index is integer
472 return static_cast<double>(sorted_data[lower_index]);
473 } else { // 在两个值之间插值/Interpolate between two values
474 double lower_val = static_cast<double>(sorted_data[lower_index]);
475 double upper_val = static_cast<double>(sorted_data[upper_index]);
476 return lower_val + (index_double - lower_index) * (upper_val - lower_val);
477 }
478}
479
500template<typename TContainer>
501std::vector<typename TContainer::value_type> min_k(const TContainer& data,
502 size_t k)
503{
504 using ValueType = typename TContainer::value_type;
505 static_assert(
506 toolbox::traits::is_less_than_comparable_v<ValueType>,
507 "min_k requires elements comparable with operator< for sorting.");
508
509 if (k == 0) {
510 return {};
511 }
513 data,
514 "min_k"); // 如果 k > 0, data 不应为空/If k > 0, data should not be empty
515
516 std::vector<ValueType> result;
517 if (k >= data.size()) { // 如果 k
518 // 大于或等于容器大小,返回所有元素排序后的结果/If k
519 // >= container size, return all sorted
520 result.assign(data.begin(), data.end());
521 std::sort(result.begin(), result.end());
522 } else {
523 result.resize(k); // 必须先调整大小/Must resize first
524 std::partial_sort_copy(data.begin(),
525 data.end(), // 从 data 复制并部分排序到 result/Copy
526 // from data and partial sort to result
527 result.begin(),
528 result.end());
529 }
530 return result;
531}
532
553template<typename TContainer>
554std::vector<typename TContainer::value_type> max_k(const TContainer& data,
555 size_t k)
556{
557 using ValueType = typename TContainer::value_type;
558 // std::partial_sort_copy with std::greater
559 // 需要元素可比较/std::partial_sort_copy with std::greater requires comparable
560 // elements std::greater<T> 使用
561 // operator>,或者等价地,operator</std::greater<T> uses operator> or
562 // equivalently operator<
563 static_assert(toolbox::traits::is_greater_than_comparable_v<ValueType>
564 || toolbox::traits::is_less_than_comparable_v<ValueType>,
565 "max_k requires comparable elements (operator> or operator< "
566 "for std::greater).");
567
568 if (k == 0) {
569 return {};
570 }
571 check_empty(data, "max_k");
572
573 std::vector<ValueType> result;
574 if (k >= data.size()) {
575 result.assign(data.begin(), data.end());
576 std::sort(result.begin(),
577 result.end(),
578 std::greater<ValueType>()); // 使用 std::greater 进行降序排序/Use
579 // std::greater for descending sort
580 } else {
581 result.resize(k);
582 std::partial_sort_copy(
583 data.begin(),
584 data.end(),
585 result.begin(),
586 result.end(),
587 std::greater<ValueType>()); // 使用 std::greater 获取最大的k个/Use
588 // std::greater to get top k
589 }
590 return result;
591}
592
593} // namespace toolbox::math
Definition matrix.hpp:4
TContainer::value_type max(const TContainer &data)
查找容器中的最大值/Find the maximum value in the container
Definition statistics.hpp:374
std::vector< typename TContainer::value_type > max_k(const TContainer &data, size_t k)
返回容器中k个最大的元素,降序排列/Return the k largest elements in the container, sorted descending
Definition statistics.hpp:554
TContainer::value_type sum(const TContainer &data)
计算容器中元素的总和(返回容器元素类型)/Compute the sum of elements in the container (returns container element type)
Definition statistics.hpp:288
TContainer::value_type min(const TContainer &data)
查找容器中的最小值/Find the minimum value in the container
Definition statistics.hpp:349
std::vector< typename TContainer::value_type > mode(const TContainer &data)
计算容器中元素的众数(可能有多个)/Compute the mode(s) of elements in a container (may be multiple)
Definition statistics.hpp:138
std::vector< typename TContainer::value_type > min_k(const TContainer &data, size_t k)
返回容器中k个最小的元素,升序排列/Return the k smallest elements in the container, sorted ascending
Definition statistics.hpp:501
TContainer::value_type range(const TContainer &data)
计算容器中元素的全距(最大值-最小值)/Compute the range (max - min) of elements in the container
Definition statistics.hpp:402
void check_empty(const TContainer &data, const char *func_name)
检查容器是否为空并抛出异常/Check if the container is empty and throw an exception
Definition statistics.hpp:37
double percentile(const TContainer &data, double p)
计算百分位数(在相邻等级之间使用线性插值)/Compute the percentile (using linear interpolation between the closest ranks)
Definition statistics.hpp:438
double sum_d(const TContainer &data)
计算容器中元素的总和(返回 double 类型以保证精度和范围)/Compute the sum of elements in the container (returns double for precision and range)
Definition statistics.hpp:318
double median(const TContainer &data)
计算容器中元素的中位数/Compute the median of elements in a container
Definition statistics.hpp:95
double variance(const TContainer &data, bool sample_variance=true)
计算方差(默认样本方差 N-1)/Compute the variance (default is sample variance N-1)
Definition statistics.hpp:201
double mean(const TContainer &data)
计算容器中元素的平均值/Compute the mean (average) of elements in a container
Definition statistics.hpp:64
double stdev(const TContainer &data, bool sample_stdev=true)
计算标准差(默认样本标准差)/Compute the standard deviation (default is sample standard deviation)
Definition statistics.hpp:261
类型特征工具集合/Type traits utilities collection