-
-
Notifications
You must be signed in to change notification settings - Fork 27
Add Row-Wise Mode Functionality (axis=1) and Improve Metadata Handling in _collection.py #1137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
4dd4ff0
e976f38
bc340ee
9f918ac
ddaef07
f8b6463
894bd9c
1dbdd2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3650,13 +3650,46 @@ def query(self, expr, **kwargs): | |
| return new_collection(Query(self, expr, kwargs)) | ||
|
|
||
| @derived_from(pd.DataFrame) | ||
| def mode(self, dropna=True, split_every=False, numeric_only=False): | ||
| modes = [] | ||
| for _, col in self.items(): | ||
| if numeric_only and not pd.api.types.is_numeric_dtype(col.dtype): | ||
| continue | ||
| modes.append(col.mode(dropna=dropna, split_every=split_every)) | ||
| return concat(modes, axis=1) | ||
| # GH#11389 - Dask issue related to adding row-wise mode functionality | ||
| # GH#1136 - Dask-Expr specific implementation for row-wise mode functionality | ||
| # Contributor: @thyripian | ||
| def mode(self, axis=0, numeric_only=False, dropna=True, split_every=False): | ||
| if axis == 0: | ||
| # Existing logic for axis=0 (column-wise mode) | ||
| modes = [] | ||
| for _, col in self.items(): | ||
| if numeric_only and not pd.api.types.is_numeric_dtype(col.dtype): | ||
| continue | ||
| modes.append(col.mode(dropna=dropna, split_every=split_every)) | ||
| return concat(modes, axis=1) | ||
| elif axis == 1: | ||
| # Implement axis=1 (row-wise mode) | ||
| num_columns = len(self.columns) # Maximum possible number of modes per row | ||
|
|
||
| def row_wise_mode(df): | ||
| result = df.mode(axis=1, numeric_only=numeric_only, dropna=dropna) | ||
| # Ensure consistent number of columns across all partitions | ||
| if result.shape[1] < num_columns: | ||
| # Pad with NaN columns | ||
| for i in range(result.shape[1], num_columns): | ||
| result[i] = np.nan | ||
| elif result.shape[1] > num_columns: | ||
| # Trim extra columns | ||
| result = result.iloc[:, :num_columns] | ||
| # Reindex columns to ensure consistent order | ||
| result = result.reindex(columns=range(num_columns)) | ||
| # Set column data types to float64 to accommodate NaN values | ||
| result = result.astype("float64") | ||
| return result | ||
|
|
||
| # Create metadata with the correct number of columns and float64 dtype | ||
| meta = pd.DataFrame( | ||
| {i: pd.Series(dtype="float64") for i in range(num_columns)} | ||
|
||
| ) | ||
|
|
||
| return self.map_partitions(row_wise_mode, meta=meta) | ||
| else: | ||
| raise ValueError(f"No axis named {axis} for object type {type(self)}") | ||
|
|
||
| @derived_from(pd.DataFrame) | ||
| def add_prefix(self, prefix): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you don't have to do this, we generally expect that the columns in every partition are matching
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you clarify which part isn't necessary? Are you referring to determining the number of columns for row-wise mode?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Everything that is in def row_wise_mode basically
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Understood. I'll make the necessary revisions this afternoon and push them to this PR once tested.