Coverage for src/sensai/util/pandas.py: 57%
49 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1import logging
2from copy import copy
4import numpy as np
5import pandas as pd
7log = logging.getLogger(__name__)
class DataFrameColumnChangeTracker:
    """
    A simple class for keeping track of changes in columns between an initial data frame and some other data frame
    (usually the result of some transformations performed on the initial one).

    Example:

    >>> from sensai.util.pandas import DataFrameColumnChangeTracker
    >>> import pandas as pd

    >>> df = pd.DataFrame({"bar": [1, 2]})
    >>> columnChangeTracker = DataFrameColumnChangeTracker(df)
    >>> df["foo"] = [4, 5]
    >>> columnChangeTracker.track_change(df)
    >>> columnChangeTracker.get_removed_columns()
    set()
    >>> columnChangeTracker.get_added_columns()
    {'foo'}
    """
    def __init__(self, initial_df: pd.DataFrame):
        """
        :param initial_df: the data frame whose columns serve as the reference for all later comparisons
        """
        # The camelCase attribute name is kept as is, since it is publicly accessible
        self.initialColumns = copy(initial_df.columns)
        # Remains None until track_change is called
        self.final_columns = None

    def track_change(self, changed_df: pd.DataFrame) -> None:
        """
        Records the columns of the given data frame as the final state; a repeated call overwrites the
        previously recorded state.

        :param changed_df: the data frame whose columns shall be compared to the initial ones
        """
        self.final_columns = copy(changed_df.columns)

    def get_removed_columns(self) -> set:
        """
        Returns the columns of the initial data frame that are not present in the tracked data frame

        :return: the set of removed column names
        :raises Exception: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        return set(self.initialColumns).difference(self.final_columns)

    def get_added_columns(self) -> set:
        """
        Returns the columns of the tracked data frame that were not present in the initial one

        :return: the set of added column names
        :raises Exception: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        return set(self.final_columns).difference(self.initialColumns)

    def column_change_string(self) -> str:
        """
        Returns a string representation of the change

        :return: "none" if the columns are identical (including their order), a string starting with "reordered"
            if only the order changed, and otherwise a listing of the added and removed columns
        :raises Exception: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        if list(self.initialColumns) == list(self.final_columns):
            return "none"
        removed_cols, added_cols = self.get_removed_columns(), self.get_added_columns()
        if removed_cols == added_cols == set():
            # same set of columns but a different sequence
            return f"reordered {list(self.final_columns)}"

        return f"added={list(added_cols)}, removed={list(removed_cols)}"

    def assert_change_was_tracked(self) -> None:
        """
        Ensures that track_change was called at least once

        :raises Exception: if no change was tracked yet
        """
        if self.final_columns is None:
            raise Exception("No change was tracked yet. "
                "Did you forget to call track_change on the resulting data frame?")
def extract_array(df: pd.DataFrame, dtype=None):
    """
    Extracts array from data frame. It is expected that each row corresponds to a data point and
    each column corresponds to a "channel". Moreover, all entries are expected to be arrays of the same shape
    (or scalars or sequences of the same length). We will refer to that shape as tensorShape.

    The output will be of shape `(N_rows, N_columns, *tensorShape)`. Thus, `N_rows` can be interpreted as dataset length
    (or batch size, if a single batch is passed) and N_columns can be interpreted as number of channels.
    Empty dimensions will be stripped, thus if the data frame has only one column, the array will have shape
    `(N_rows, *tensorShape)`.
    E.g. an image with three channels could equally be passed as data frame of the type


    +------------------+------------------+------------------+
    | R                | G                | B                |
    +==================+==================+==================+
    | channel          | channel          | channel          |
    +------------------+------------------+------------------+
    | channel          | channel          | channel          |
    +------------------+------------------+------------------+
    | ...              | ...              | ...              |
    +------------------+------------------+------------------+

    or as data frame of type

    +------------------+
    | image            |
    +==================+
    | RGB-array        |
    +------------------+
    | RGB-array        |
    +------------------+
    | ...              |
    +------------------+

    In both cases the returned array will have shape `(N_images, 3, width, height)`

    Note that if the data frame contains exactly one row, the leading (row) dimension is removed as well.

    :param df: data frame where each entry is an array of shape tensorShape
    :param dtype: if not None, convert the array's data type to this type (string or numpy dtype)
    :return: array of shape `(N_rows, N_columns, *tensorShape)` with stripped empty dimensions
    :raises ValueError: if the data frame has no rows or no columns, or if its entries cannot be stacked
        (e.g. because their shapes differ)
    """
    n_rows, n_columns = df.shape
    if n_rows == 0 or n_columns == 0:
        # Without this guard, empty frames would fail with an unspecific IndexError further below
        raise ValueError(f"No array can be extracted from frame of length {len(df)} with columns {list(df.columns)}. "
            f"The data frame must contain at least one row and one column")

    # Only inspect the first entry if the message will actually be emitted
    if log.isEnabledFor(logging.DEBUG):
        log.debug(f"Stacking tensors of shape {np.array(df.iloc[0, 0]).shape}")
    try:
        # This compact way of extracting the array causes dtypes to be modified,
        #     arr = np.stack(df.apply(np.stack, axis=1)).squeeze()
        # so we use this numpy-only alternative:
        arr = df.values
        if arr.shape[1] > 1:
            # stack the entries of each row (channels) and then stack the rows
            arr = np.stack([np.stack(arr[i]) for i in range(arr.shape[0])])
        else:
            # single column: no channel dimension is created
            arr = np.stack(arr[:, 0])
        # For the case where there is only one row, the old implementation above removed the first dimension,
        # so we do the same, even though it seems odd to do so (potential problem for batch size 1)
        # TODO: remove this behavior
        if arr.shape[0] == 1:
            arr = arr[0]
    except ValueError as e:
        # keep the original numpy error as the cause to ease debugging
        raise ValueError(f"No array can be extracted from frame of length {len(df)} with columns {list(df.columns)}. "
            f"Make sure that all entries have the same shape") from e
    if dtype is not None:
        arr = arr.astype(dtype, copy=False)
    return arr
def remove_duplicate_index_entries(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes successive duplicate index entries by keeping only the first occurrence for every duplicate index element.

    Only directly neighbouring index entries are compared, so duplicates are removed completely only if
    equal index entries are adjacent (as is the case for a sorted index).

    :param df: the data frame, which is assumed to have a sorted index
    :return: the (modified) data frame with duplicate index entries removed; for an empty data frame,
        an (empty) copy is returned
    """
    index = df.index
    if len(index) == 0:
        # nothing to remove; return a new object as in the non-empty case
        return df.copy()
    # The first row is always kept; every further row is kept iff its index entry differs from its predecessor's
    keep = [True]
    keep.extend(item != prev_item for prev_item, item in zip(index[:-1], index[1:]))
    return df[keep]