Coverage for src/sensai/util/pandas.py: 57%

49 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import logging 

2from copy import copy 

3 

4import numpy as np 

5import pandas as pd 

6 

7log = logging.getLogger(__name__) 

8 

9 

class DataFrameColumnChangeTracker:
    """
    A simple class for keeping track of changes in columns between an initial data frame and some other data frame
    (usually the result of some transformations performed on the initial one).

    Example:

    >>> from sensai.util.pandas import DataFrameColumnChangeTracker
    >>> import pandas as pd

    >>> df = pd.DataFrame({"bar": [1, 2]})
    >>> columnChangeTracker = DataFrameColumnChangeTracker(df)
    >>> df["foo"] = [4, 5]
    >>> columnChangeTracker.track_change(df)
    >>> columnChangeTracker.get_removed_columns()
    set()
    >>> columnChangeTracker.get_added_columns()
    {'foo'}
    """
    def __init__(self, initial_df: pd.DataFrame):
        """
        :param initial_df: the data frame whose columns serve as the reference for later comparisons
        """
        # NOTE: the attribute name is kept in camelCase for backward compatibility with existing users
        # A copy is stored so that later changes to the data frame's columns do not affect the reference
        self.initialColumns = copy(initial_df.columns)
        self.final_columns = None

    def track_change(self, changed_df: pd.DataFrame):
        """
        Records the columns of the given data frame as the state to compare against the initial columns.
        Calling this again replaces the previously recorded state.

        :param changed_df: the data frame resulting from changes applied to the initial data frame
        """
        self.final_columns = copy(changed_df.columns)

    def get_removed_columns(self):
        """
        :return: the set of columns that were present in the initial data frame but are absent from the tracked one
        :raises RuntimeError: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        return set(self.initialColumns).difference(self.final_columns)

    def get_added_columns(self):
        """
        :return: the set of columns of the tracked data frame that were not present in the initial one
        :raises RuntimeError: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        return set(self.final_columns).difference(self.initialColumns)

    def column_change_string(self):
        """
        Returns a string representation of the change

        :return: "none" if the columns (including their order) are unchanged, "reordered [...]" if the same columns
            appear in a different order, and "added=[...], removed=[...]" otherwise
        :raises RuntimeError: if no change was tracked yet
        """
        self.assert_change_was_tracked()
        if list(self.initialColumns) == list(self.final_columns):
            return "none"
        removed_cols, added_cols = self.get_removed_columns(), self.get_added_columns()
        if not removed_cols and not added_cols:
            # same set of columns but the lists differ, i.e. only the order has changed
            return f"reordered {list(self.final_columns)}"

        return f"added={list(added_cols)}, removed={list(removed_cols)}"

    def assert_change_was_tracked(self):
        """
        :raises RuntimeError: if track_change has not been called yet
        """
        if self.final_columns is None:
            raise RuntimeError("No change was tracked yet. "
                "Did you forget to call track_change on the resulting data frame?")

64 

65 

def extract_array(df: pd.DataFrame, dtype=None):
    """
    Extracts array from data frame. It is expected that each row corresponds to a data point and
    each column corresponds to a "channel". Moreover, all entries are expected to be arrays of the same shape
    (or scalars or sequences of the same length). We will refer to that shape as tensorShape.

    The output will be of shape `(N_rows, N_columns, *tensorShape)`. Thus, `N_rows` can be interpreted as dataset length
    (or batch size, if a single batch is passed) and N_columns can be interpreted as number of channels.
    Empty dimensions will be stripped, thus if the data frame has only one column, the array will have shape
    `(N_rows, *tensorShape)`.
    E.g. an image with three channels could equally be passed as data frame of the type


    +------------------+------------------+------------------+
    | R                | G                | B                |
    +==================+==================+==================+
    | channel          | channel          | channel          |
    +------------------+------------------+------------------+
    | channel          | channel          | channel          |
    +------------------+------------------+------------------+
    | ...              | ...              | ...              |
    +------------------+------------------+------------------+

    or as data frame of type

    +------------------+
    | image            |
    +==================+
    | RGB-array        |
    +------------------+
    | RGB-array        |
    +------------------+
    | ...              |
    +------------------+

    In both cases the returned array will have shape `(N_images, 3, width, height)`

    :param df: data frame where each entry is an array of shape tensorShape
    :param dtype: if not None, convert the array's data type to this type (string or numpy dtype)
    :return: array of shape `(N_rows, N_columns, *tensorShape)` with stripped empty dimensions
    :raises ValueError: if the data frame is empty or if its entries cannot be stacked (e.g. differing shapes)
    """
    if df.empty:
        # Without this check, an empty frame would fail with an unspecific IndexError further below
        raise ValueError(f"No array can be extracted from an empty data frame "
            f"(shape {df.shape}, columns {list(df.columns)})")
    # Only build the array needed for the message if the message is actually going to be emitted
    if log.isEnabledFor(logging.DEBUG):
        log.debug(f"Stacking tensors of shape {np.array(df.iloc[0, 0]).shape}")
    try:
        # This compact way of extracting the array causes dtypes to be modified,
        #     arr = np.stack(df.apply(np.stack, axis=1)).squeeze()
        # so we use this numpy-only alternative:
        arr = df.values
        if arr.shape[1] > 1:
            arr = np.stack([np.stack(row) for row in arr])
        else:
            arr = np.stack(arr[:, 0])
        # For the case where there is only one row, the old implementation above removed the first dimension,
        # so we do the same, even though it seems odd to do so (potential problem for batch size 1)
        # TODO: remove this behavior
        if arr.shape[0] == 1:
            arr = arr[0]
    except ValueError as e:
        raise ValueError(f"No array can be extracted from frame of length {len(df)} with columns {list(df.columns)}. "
            f"Make sure that all entries have the same shape") from e
    if dtype is not None:
        arr = arr.astype(dtype, copy=False)
    return arr

128 

129 

def remove_duplicate_index_entries(df: pd.DataFrame):
    """
    Removes successive duplicate index entries by keeping only the first occurrence for every duplicate index element.
    Only directly neighbouring entries are compared, i.e. equal index elements that are not adjacent are all retained.
    An empty data frame is returned as an empty data frame with the same columns.

    :param df: the data frame, which is assumed to have a sorted index
    :return: the (modified) data frame with duplicate index entries removed
    """
    index = df.index
    # A boolean numpy mask is used (rather than a list) because an empty list passed to df[...] would be
    # interpreted as a column selection and thus drop all columns of an empty data frame
    keep = np.ones(len(index), dtype=bool)
    if len(index) > 1:
        # the first entry is always kept; every further entry is kept iff it differs from its predecessor
        keep[1:] = [item != prev_item for prev_item, item in zip(index[:-1], index[1:])]
    return df[keep]