Skip to content

Commit 64041cc

Browse files
committed
Add Skewness and Kurtosis actions to RDataFrame using Welford's algorithm
1 parent 0033360 commit 64041cc

File tree

3 files changed

+266
-1
lines changed

3 files changed

+266
-1
lines changed

tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include <utility> // std::index_sequence
4545
#include <vector>
4646
#include <numeric> // std::accumulate in MeanHelper
47+
#include <cmath>
4748

4849
class TCollection;
4950
class TStatistic;
@@ -1316,6 +1317,193 @@ public:
13161317
}
13171318
};
13181319

1320+
// Implements Welford's Online Algorithm for Skewness
1321+
class R__CLING_PTRCHECK(off) SkewnessHelper : public RActionImpl<SkewnessHelper> {
1322+
unsigned int fNSlots;
1323+
std::shared_ptr<double> fResultSkewness;
1324+
1325+
// Accumulators per slot
1326+
std::vector<ULong64_t> fCounts;
1327+
std::vector<double> fMeans;
1328+
std::vector<double> fM2; // Sum of squares of differences
1329+
std::vector<double> fM3; // Sum of cubed differences
1330+
1331+
public:
1332+
SkewnessHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
1333+
: fNSlots(nSlots),
1334+
fResultSkewness(resPtr),
1335+
fCounts(fNSlots, 0),
1336+
fMeans(fNSlots, 0.),
1337+
fM2(fNSlots, 0.),
1338+
fM3(fNSlots, 0.)
1339+
{
1340+
}
1341+
1342+
SkewnessHelper(SkewnessHelper &&) = default;
1343+
SkewnessHelper(const SkewnessHelper &) = delete;
1344+
1345+
void InitTask(TTreeReader *, unsigned int) {}
1346+
1347+
void Exec(unsigned int slot, double val)
1348+
{
1349+
ULong64_t n = ++fCounts[slot];
1350+
double delta = val - fMeans[slot];
1351+
double delta_n = delta / n;
1352+
double term1 = delta * delta_n * (n - 1);
1353+
1354+
fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
1355+
fM2[slot] += term1;
1356+
fMeans[slot] += delta_n;
1357+
}
1358+
1359+
template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
1360+
void Exec(unsigned int slot, const T &vs)
1361+
{
1362+
for (auto &&v : vs) {
1363+
Exec(slot, v);
1364+
}
1365+
}
1366+
1367+
void Initialize() { /* noop */ }
1368+
1369+
void Finalize()
1370+
{
1371+
// Merge all slots into slot 0 using Chan et al. parallel algorithm
1372+
for (unsigned int i = 1; i < fNSlots; ++i) {
1373+
if (fCounts[i] == 0)
1374+
continue;
1375+
1376+
ULong64_t n1 = fCounts[0];
1377+
ULong64_t n2 = fCounts[i];
1378+
ULong64_t n = n1 + n2;
1379+
1380+
double delta = fMeans[i] - fMeans[0];
1381+
double delta2 = delta * delta;
1382+
double delta3 = delta * delta2;
1383+
1384+
fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;
1385+
fM2[0] += fM2[i] + delta2 * n1 * n2 / n;
1386+
fMeans[0] += delta * n2 / n;
1387+
fCounts[0] = n;
1388+
}
1389+
1390+
if (fCounts[0] > 2 && fM2[0] > 0) {
1391+
// Cast required to resolve ambiguity with RVec math wrappers
1392+
*fResultSkewness = (std::sqrt(static_cast<double>(fCounts[0])) * fM3[0]) / std::pow(fM2[0], 1.5);
1393+
} else {
1394+
*fResultSkewness = 0.0;
1395+
}
1396+
}
1397+
1398+
std::string GetActionName() { return "Skewness"; }
1399+
1400+
SkewnessHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
1401+
{
1402+
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
1403+
return SkewnessHelper(result, fCounts.size());
1404+
}
1405+
1406+
std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final { return nullptr; }
1407+
};
1408+
1409+
// Implements Welford's Online Algorithm for Kurtosis
1410+
class R__CLING_PTRCHECK(off) KurtosisHelper : public RActionImpl<KurtosisHelper> {
1411+
unsigned int fNSlots;
1412+
std::shared_ptr<double> fResultKurtosis;
1413+
1414+
// Accumulators per slot
1415+
std::vector<ULong64_t> fCounts;
1416+
std::vector<double> fMeans;
1417+
std::vector<double> fM2;
1418+
std::vector<double> fM3;
1419+
std::vector<double> fM4;
1420+
1421+
public:
1422+
KurtosisHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
1423+
: fNSlots(nSlots),
1424+
fResultKurtosis(resPtr),
1425+
fCounts(fNSlots, 0),
1426+
fMeans(fNSlots, 0.),
1427+
fM2(fNSlots, 0.),
1428+
fM3(fNSlots, 0.),
1429+
fM4(fNSlots, 0.)
1430+
{
1431+
}
1432+
1433+
KurtosisHelper(KurtosisHelper &&) = default;
1434+
KurtosisHelper(const KurtosisHelper &) = delete;
1435+
1436+
void InitTask(TTreeReader *, unsigned int) {}
1437+
1438+
void Exec(unsigned int slot, double val)
1439+
{
1440+
ULong64_t n = ++fCounts[slot];
1441+
double delta = val - fMeans[slot];
1442+
double delta_n = delta / n;
1443+
double term1 = delta * delta_n * (n - 1);
1444+
1445+
fM4[slot] +=
1446+
term1 * delta_n * delta_n * (n * n - 3 * n + 3) + 6 * delta_n * delta_n * fM2[slot] - 4 * delta_n * fM3[slot];
1447+
fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
1448+
fM2[slot] += term1;
1449+
fMeans[slot] += delta_n;
1450+
}
1451+
1452+
template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
1453+
void Exec(unsigned int slot, const T &vs)
1454+
{
1455+
for (auto &&v : vs) {
1456+
Exec(slot, v);
1457+
}
1458+
}
1459+
1460+
void Initialize() { /* noop */ }
1461+
1462+
void Finalize()
1463+
{
1464+
for (unsigned int i = 1; i < fNSlots; ++i) {
1465+
if (fCounts[i] == 0)
1466+
continue;
1467+
1468+
ULong64_t n1 = fCounts[0];
1469+
ULong64_t n2 = fCounts[i];
1470+
ULong64_t n = n1 + n2;
1471+
1472+
double delta = fMeans[i] - fMeans[0];
1473+
double delta2 = delta * delta;
1474+
double delta3 = delta * delta2;
1475+
double delta4 = delta2 * delta2;
1476+
1477+
fM4[0] += fM4[i] + delta4 * n1 * n2 * (n1 * n1 - n1 * n2 + n2 * n2) / (n * n * n) +
1478+
6.0 * delta2 * (n1 * n1 * fM2[i] + n2 * n2 * fM2[0]) / (n * n) +
1479+
4.0 * delta * (n1 * fM3[i] - n2 * fM3[0]) / n;
1480+
1481+
fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;
1482+
fM2[0] += fM2[i] + delta2 * n1 * n2 / n;
1483+
fMeans[0] += delta * n2 / n;
1484+
fCounts[0] = n;
1485+
}
1486+
1487+
if (fCounts[0] > 3 && fM2[0] > 0) {
1488+
// Calculate Excess Kurtosis: (N*M4) / (M2^2) - 3
1489+
double n = static_cast<double>(fCounts[0]);
1490+
*fResultKurtosis = (n * fM4[0]) / (fM2[0] * fM2[0]) - 3.0;
1491+
} else {
1492+
*fResultKurtosis = -3.0;
1493+
}
1494+
}
1495+
1496+
std::string GetActionName() { return "Kurtosis"; }
1497+
1498+
KurtosisHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
1499+
{
1500+
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
1501+
return KurtosisHelper(result, fCounts.size());
1502+
}
1503+
1504+
std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final { return nullptr; }
1505+
};
1506+
13191507
template <typename PrevNodeType>
13201508
class R__CLING_PTRCHECK(off) DisplayHelper : public RActionImpl<DisplayHelper<PrevNodeType>> {
13211509
private:

tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ struct Sum{};
100100
struct Mean{};
101101
struct Fill{};
102102
struct StdDev{};
103+
struct Skewness{};
104+
struct Kurtosis{};
103105
struct Display{};
104106
struct Snapshot{};
105107
struct Book{};
@@ -246,6 +248,28 @@ BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &stdDeviation
246248
return std::make_unique<Action_t>(Helper_t(stdDeviationV, nSlots), bl, prevNode, colRegister);
247249
}
248250

251+
// Skewness action
252+
template <typename ColType, typename PrevNodeType>
253+
std::unique_ptr<RActionBase>
254+
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &skewnessV, const unsigned int nSlots,
255+
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Skewness, const RColumnRegister &colRegister)
256+
{
257+
using Helper_t = SkewnessHelper;
258+
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
259+
return std::make_unique<Action_t>(Helper_t(skewnessV, nSlots), bl, prevNode, colRegister);
260+
}
261+
262+
// Kurtosis action
263+
template <typename ColType, typename PrevNodeType>
264+
std::unique_ptr<RActionBase>
265+
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &kurtosisV, const unsigned int nSlots,
266+
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Kurtosis, const RColumnRegister &colRegister)
267+
{
268+
using Helper_t = KurtosisHelper;
269+
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
270+
return std::make_unique<Action_t>(Helper_t(kurtosisV, nSlots), bl, prevNode, colRegister);
271+
}
272+
249273
using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<ROOT::RDF::RDisplay>>;
250274

251275
// Display action

tree/dataframe/inc/ROOT/RDF/RInterface.hxx

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2906,7 +2906,60 @@ public:
29062906
return CreateAction<RDFInternal::ActionTags::StdDev, T>(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr);
29072907
}
29082908

2909-
// clang-format off
2909+
////////////////////////////////////////////////////////////////////////////
2910+
/// \brief Return the skewness of processed column values (*lazy action*).
2911+
/// \tparam T The type of the branch/column.
2912+
/// \param[in] columnName The name of the branch/column to be treated.
2913+
/// \return the skewness of the selected column wrapped in a RResultPtr.
2914+
///
2915+
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2916+
/// template specialization of this method.
2917+
///
2918+
/// This action is *lazy*: upon invocation of this method the calculation is
2919+
/// booked but not executed. Also see RResultPtr.
2920+
///
2921+
/// ### Example usage:
2922+
/// ~~~{.cpp}
2923+
/// // Deduce column type (this invocation needs jitting internally)
2924+
/// auto skew = df.Skewness("values");
2925+
/// ~~~
2926+
///
2927+
/// \ingroup dataframe_actions
2928+
template <typename T = RDFDetail::RInferredType>
2929+
RResultPtr<double> Skewness(std::string_view columnName = "")
2930+
{
2931+
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2932+
auto skewnessV = std::make_shared<double>(0);
2933+
return CreateAction<RDFInternal::ActionTags::Skewness, T>(userColumns, skewnessV, skewnessV, fProxiedPtr);
2934+
}
2935+
2936+
////////////////////////////////////////////////////////////////////////////
2937+
/// \brief Return the excess kurtosis of processed column values (*lazy action*).
2938+
/// \tparam T The type of the branch/column.
2939+
/// \param[in] columnName The name of the branch/column to be treated.
2940+
/// \return the excess kurtosis of the selected column wrapped in a RResultPtr.
2941+
///
2942+
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2943+
/// template specialization of this method.
2944+
///
2945+
/// This action is *lazy*: upon invocation of this method the calculation is
2946+
/// booked but not executed. Also see RResultPtr.
2947+
///
2948+
/// ### Example usage:
2949+
/// ~~~{.cpp}
2950+
/// // Deduce column type (this invocation needs jitting internally)
2951+
/// auto kurt = df.Kurtosis("values");
2952+
/// ~~~
2953+
///
2954+
/// \ingroup dataframe_actions
2955+
template <typename T = RDFDetail::RInferredType>
2956+
RResultPtr<double> Kurtosis(std::string_view columnName = "")
2957+
{
2958+
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
2959+
auto kurtosisV = std::make_shared<double>(0);
2960+
return CreateAction<RDFInternal::ActionTags::Kurtosis, T>(userColumns, kurtosisV, kurtosisV, fProxiedPtr);
2961+
}
2962+
29102963
////////////////////////////////////////////////////////////////////////////
29112964
/// \brief Return the sum of processed column values (*lazy action*).
29122965
/// \tparam T The type of the branch/column.

0 commit comments

Comments
 (0)