Coverage for src/sensai/util/cache_azure.py: 0%

407 statements  

coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1from __future__ import annotations 

2import collections 

3import functools 

4import pickle 

5import sys 

6from abc import ABC, abstractmethod 

7from concurrent.futures.thread import ThreadPoolExecutor 

8from typing import Callable, Dict, Union, Any, List, Sequence, Optional 

9import json 

10import logging 

11import re 

12import threading 

13 

14 

15from azure.storage.table import TableService, TableBatch, Entity 

16from azure.storage.blob import BlockBlobService 

17import pandas as pd 

18import numpy as np 

19 

20from .cache import PersistentKeyValueCache, PeriodicUpdateHook 

21 

22_log = logging.getLogger(__name__) 

23 

24 

25class Serialiser(ABC): 

26 """ 

27 Abstraction for mechanisms to serialise values which do not fit the table storage data model,

28 see https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-the-table-service-data-model

29 """ 

30 

31 @abstractmethod 

32 def serialise(self, value) -> str: 

33 pass 

34 

35 @abstractmethod 

36 def deserialise(self, value: str): 

37 pass 

38 

39 

40class NumpyArrayJsonSerialiser(Serialiser): 

41 """ 

42 Serialises a numpy array as a JSON string of its (nested) list representation

43 """ 

44 

45 def serialise(self, value: np.ndarray) -> str: 

46 return json.dumps(value.tolist()) 

47 

48 def deserialise(self, value: str): 

49 return np.array(json.loads(value)) 

50 

51 
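As a minimal usage sketch (an editor-added illustration, not part of the measured module; the array values are made up), the serialiser round-trips an array through a JSON string:

import numpy as np

serialiser = NumpyArrayJsonSerialiser()
encoded = serialiser.serialise(np.array([[1.0, 2.0], [3.0, 4.0]]))  # '[[1.0, 2.0], [3.0, 4.0]]'
decoded = serialiser.deserialise(encoded)  # numpy array with the same contents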

52class PropertyLoader(ABC): 

53 """ 

54 Abstraction of a customised loader for an entity property 

55 """ 

56 

57 @abstractmethod 

58 def load_property_value(self, entity: Entity): 

59 pass 

60 

61 @abstractmethod 

62 def write_property_value(self, entity: Entity): 

63 pass 

64 

65 @abstractmethod 

66 def load_property_value_to_data_frame_column(self, df: pd.DataFrame): 

67 pass 

68 

69 

70class SerialisedPropertyLoader(PropertyLoader): 

71 """ 

72 PropertyLoader which serialises and de-serialises values. Useful if the type of the values is not supported by the table storage data model,

73 see https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-the-table-service-data-model

74 """ 

75 def __init__(self, property_name: str, serialiser: Serialiser): 

76 self.serialiser = serialiser 

77 self.property_name = property_name 

78 

79 def load_property_value(self, entity: Entity): 

80 entity[self.property_name] = self.serialiser.deserialise(entity[self.property_name]) 

81 

82 def write_property_value(self, entity: Entity): 

83 entity[self.property_name] = self.serialiser.serialise(entity[self.property_name]) 

84 

85 def load_property_value_to_data_frame_column(self, df: pd.DataFrame): 

86 if self.property_name in df.columns: 

87 df.loc[:, self.property_name] = [self.serialiser.deserialise(value) for value in df[self.property_name]] 

88 

89 
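Illustrative sketch (editor addition; relies on the module's numpy import, and the property and key names are made up): a SerialisedPropertyLoader rewrites a single property of an entity dict in place before insertion and after retrieval:

loader = SerialisedPropertyLoader("cache_value", NumpyArrayJsonSerialiser())
entity = {"PartitionKey": "myTable", "RowKey": "key1", "cache_value": np.array([1.0, 2.0])}
loader.write_property_value(entity)  # entity["cache_value"] is now the JSON string '[1.0, 2.0]'
loader.load_property_value(entity)   # entity["cache_value"] is a numpy array again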

90class AzureTableBlobBackend(ABC): 

91 """ 

92 Abstraction of a blob backend which allows for conveniently setting and getting values stored in blob storage via a

93 reference to the value

94 """ 

95 

96 @abstractmethod 

97 def get_value_from_reference(self, value_identifier: str): 

98 pass 

99 

100 @abstractmethod 

101 def get_value_reference(self, partition_key: str, row_key: str, value_name: str, blob_name_prefix: str = None) -> str: 

102 pass 

103 

104 @abstractmethod 

105 def set_value_for_reference(self, value_identifier: str, value): 

106 pass 

107 

108 

109class BlobPerKeyAzureTableBlobBackend(AzureTableBlobBackend, ABC): 

110 

111 """ 

112 Backend which stores serialised values as /tableName/partitionKey/rowKey/valueName.<fileExtension>

113 or /tableName/rowKey/valueName.<fileExtension> if the partitionKey equals the tableName

114 """ 

115 

116 def __init__(self, block_blob_service: BlockBlobService, container_name: str): 

117 """ 

118 

119 :param block_blob_service: https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blockblobservice.blockblobservice?view=azure-python-previous

:param container_name: name of the blob container to use; it is created if it does not yet exist

120 """

121 self.block_blob_service = block_blob_service 

122 self.container_name = container_name 

123 self.container_list = [container.name for container in block_blob_service.list_containers()] 

124 if container_name not in self.container_list: 

125 self.block_blob_service.create_container(container_name) 

126 self.container_list.append(container_name) 

127 

128 @property 

129 @abstractmethod 

130 def file_extension(self): 

131 pass 

132 

133 @abstractmethod 

134 def _get_blob_value(self, container_name, blob_name): 

135 pass 

136 

137 @abstractmethod 

138 def _write_value_to_blob(self, container_name, blob_name, value): 

139 pass 

140 

141 def get_value_from_reference(self, value_identifier: str): 

142 container_name = self._get_container_name_from_identifier(value_identifier) 

143 blob_name = self._get_blob_name_from_identifier(value_identifier) 

144 return self._get_blob_value(container_name, blob_name) 

145 

146 def get_value_reference(self, partition_key: str, row_key: str, value_name: str, blob_name_prefix: str = None) -> str: 

147 blob_name = self._get_blob_name_from_keys(partition_key, row_key, value_name, blob_prefix=blob_name_prefix) 

148 return self.block_blob_service.make_blob_url(self.container_name, blob_name) 

149 

150 def set_value_for_reference(self, value_identifier: str, value): 

151 container_name = self._get_container_name_from_identifier(value_identifier) 

152 blob_name = self._get_blob_name_from_identifier(value_identifier) 

153 self._write_value_to_blob(container_name, blob_name, value) 

154 

155 def _get_blob_name_from_identifier(self, value_identifier: str): 

156 return (value_identifier.partition(f"{self.block_blob_service.primary_endpoint}/")[2]).partition("/")[2] 

157 

158 def _get_container_name_from_identifier(self, value_identifier: str): 

159 return (value_identifier.partition(f"{self.block_blob_service.primary_endpoint}/")[2]).partition("/")[0] 

160 

161 def _get_blob_name_from_keys(self, partition_key: str, row_key: str, value_name: str, blob_prefix: str = None): 

162 identifier_list = [blob_prefix, partition_key] if blob_prefix is not None and blob_prefix != partition_key else [partition_key] 

163 identifier_list.extend([row_key, value_name]) 

164 return "/".join(identifier_list) + self.file_extension 

165 

166 
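For illustration (editor addition with hypothetical keys), _get_blob_name_from_keys produces names such as:

# blob_prefix="myTable", partition_key="p1", row_key="r1", value_name="cache_value" -> "myTable/p1/r1/cache_value" + file_extension
# blob_prefix="myTable", partition_key="myTable", row_key="r1", value_name="cache_value" -> "myTable/r1/cache_value" + file_extension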

167class TextDumpAzureTableBlobBackend(BlobPerKeyAzureTableBlobBackend): 

168 """ 

169 Backend stores values as txt files in the structure /tableName/partitionKey/rowKey/valueName 

170 """ 

171 

172 @property 

173 def file_extension(self): 

174 return "" 

175 

176 def _get_blob_value(self, container_name, blob_name): 

177 return self.block_blob_service.get_blob_to_text(container_name, blob_name).content 

178 

179 def _write_value_to_blob(self, container_name, blob_name, value): 

180 self.block_blob_service.create_blob_from_text(container_name, blob_name, value) 

181 

182 

183class JsonAzureTableBlobBackend(BlobPerKeyAzureTableBlobBackend): 

184 """ 

185 Backend stores values as json files in the structure /tableName/partitionKey/rowKey/valueName.json 

186 """ 

187 

188 @property 

189 def file_extension(self): 

190 return ".json" 

191 

192 def _get_blob_value(self, container_name, blob_name): 

193 encoded_value = self.block_blob_service.get_blob_to_bytes(container_name, blob_name).content 

194 return self._decode_bytes_to_value(encoded_value) 

195 

196 def _write_value_to_blob(self, container_name, blob_name, value): 

197 encoded_value = self._encode_value_to_bytes(value) 

198 self.block_blob_service.create_blob_from_bytes(container_name, blob_name, encoded_value) 

199 

200 @staticmethod 

201 def _encode_value_to_bytes(value): 

202 return str.encode(json.dumps(value)) 

203 

204 @staticmethod 

205 def _decode_bytes_to_value(_bytes): 

206 return json.loads(_bytes.decode()) 

207 

208 

209class PickleAzureTableBlobBackend(JsonAzureTableBlobBackend): 

210 """ 

211 Backend stores values as pickle files in the structure /tableName/partitionKey/rowKey/valueName.pickle 

212 """ 

213 

214 @property 

215 def file_extension(self): 

216 return ".pickle" 

217 

218 @staticmethod 

219 def _encode_value_to_bytes(value): 

220 return pickle.dumps(value) 

221 

222 @staticmethod 

223 def _decode_bytes_to_value(_bytes): 

224 return pickle.loads(_bytes) 

225 

226 
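A construction sketch (editor addition; it assumes the legacy azure-storage-blob 2.x BlockBlobService API imported above, and the account credentials, container and key names are placeholders):

blob_service = BlockBlobService(account_name="myaccount", account_key="<key>")
backend = PickleAzureTableBlobBackend(blob_service, "cachevalues")  # container is created if it does not exist
ref = backend.get_value_reference("myTable", "row1", "cache_value")  # URL of blob myTable/row1/cache_value.pickle
backend.set_value_for_reference(ref, {"any": "picklable value"})
restored = backend.get_value_from_reference(ref)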

227class BlobBackedPropertyLoader(PropertyLoader): 

228 AZURE_ALLOWED_SIZE_PER_PROPERTY_BYTES = 64000 

229 AZURE_ALLOWED_STRING_LENGTH_PER_PROPERTY = 32000 

230 

231 """ 

232 PropertyLoader which writes and reads values from a blob backend via a reference to the value. Useful if values cannot

233 be stored in table storage itself because they are not supported by the table storage data model,

234 see https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-the-table-service-data-model 

235 """ 

236 def __init__(self, property_name: str, blob_backend: AzureTableBlobBackend, blob_prefix: str = None, 

237 property_boolean_blob_status_name: str = None, max_workers=None): 

238 """ 

239 :param property_name: name of property in table 

240 :param property_boolean_blob_status_name: name of the table property holding a boolean flag which indicates whether the value is

241 blob-backed. If None, each value is assumed to be blob-backed.

242 :param blob_backend: actual backend to use for storage 

243 :param blob_prefix: prefix to use for blob in storage, e.g. a table name 

244 :param max_workers: maximal number of workers to load data from blob storage 

245 """ 

246 self.blob_prefix = blob_prefix 

247 self.property_blob_status_name = property_boolean_blob_status_name 

248 self.blob_backend = blob_backend 

249 self.max_workers = max_workers 

250 self.propertyName = property_name 

251 

252 def load_property_value(self, entity: Entity): 

253 if self._is_entity_value_blob_backed(entity): 

254 entity[self.propertyName] = self.blob_backend.get_value_from_reference(entity[self.propertyName]) 

255 

256 def write_property_value(self, entity: Entity): 

257 if self.propertyName in entity.keys(): 

258 if self._need_to_write_to_blob(entity[self.propertyName]): 

259 value_identifier = self.blob_backend.get_value_reference(entity["PartitionKey"], entity["RowKey"], self.propertyName, 

260 blob_name_prefix=self.blob_prefix) 

261 value = entity[self.propertyName] 

262 self.blob_backend.set_value_for_reference(value_identifier, value) 

263 entity[self.propertyName] = value_identifier 

264 property_blob_status = True if self.property_blob_status_name is not None else None 

265 else: 

266 property_blob_status = False if self.property_blob_status_name is not None else None 

267 

268 if property_blob_status is not None: 

269 entity[self.property_blob_status_name] = property_blob_status 

270 

271 def load_property_value_to_data_frame_column(self, df: pd.DataFrame): 

272 if self.propertyName in df.columns: 

273 if self.property_blob_status_name is None: 

274 df.loc[:, self.propertyName] = self._load_values_in_series(df[self.propertyName]) 

275 else: 

276 df.loc[df[self.property_blob_status_name], self.propertyName] = \ 

277 self._load_values_in_series(df.loc[df[self.property_blob_status_name], self.propertyName]) 

278 

279 def _need_to_write_to_blob(self, value): 

280 if self.property_blob_status_name is None: 

281 return True 

282 if sys.getsizeof(value) > self.AZURE_ALLOWED_SIZE_PER_PROPERTY_BYTES: 

283 return True 

284 if isinstance(value, str) and len(value) > self.AZURE_ALLOWED_STRING_LENGTH_PER_PROPERTY: 

285 return True 

286 return False 

287 

288 def _is_entity_value_blob_backed(self, entity: Entity): 

289 if self.propertyName not in entity.keys(): 

290 return False 

291 if self.property_blob_status_name is None or self.property_blob_status_name not in entity: 

292 return True 

293 return entity[self.property_blob_status_name] 

294 

295 def _load_values_in_series(self, _series: pd.Series): 

296 with ThreadPoolExecutor(max_workers=self.max_workers) as executor: 

297 _series = list(executor.map(self.blob_backend.get_value_from_reference, _series)) 

298 return _series 

299 

300 
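A configuration sketch (editor addition, reusing the `backend` from the previous sketch; the property and table names are illustrative). Only values exceeding the per-property limits are moved to blob storage, and the flag property records which rows are blob-backed:

loader = BlobBackedPropertyLoader("cache_value", backend, blob_prefix="myTable",
    property_boolean_blob_status_name="cache_value_blob_backed")
entity = {"PartitionKey": "myTable", "RowKey": "row1", "cache_value": "x" * 50000}
loader.write_property_value(entity)
# entity["cache_value"] now holds the blob URL and entity["cache_value_blob_backed"] is True,
# because the string exceeds AZURE_ALLOWED_STRING_LENGTH_PER_PROPERTY (32000 characters)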

301class BlobBackedSerialisedPropertyLoader(BlobBackedPropertyLoader, SerialisedPropertyLoader): 

302 """ 

303 Property loader, which combines serialisation and blob backing. 

304 """ 

305 def __init__(self, property_name, serialiser: Serialiser, blob_backend: AzureTableBlobBackend, blob_prefix: str = None, 

306 property_boolean_blob_status_name: str = None, max_workers=None): 

307 """ 

310 :param property_name: name of property in table

311 :param serialiser: serialiser to apply to the value before writing and after reading

312 :param property_boolean_blob_status_name: name of the table property holding a boolean flag which indicates whether the value is

313 blob-backed. If None, each value is assumed to be blob-backed.

314 :param blob_backend: actual backend to use for storage 

315 :param blob_prefix: prefix to use for blob in storage, e.g. a table name 

316 :param max_workers: maximal number of workers to load data from blob storage 

317 """ 

318 SerialisedPropertyLoader.__init__(self, property_name, serialiser) 

319 BlobBackedPropertyLoader.__init__(self, property_name, blob_backend, blob_prefix, property_boolean_blob_status_name, max_workers) 

320 

321 def load_property_value(self, entity: Entity): 

# resolve the blob reference to the serialised value first, then de-serialise it
# (explicit base-class calls: via super(), the MRO would resolve these to the wrong sibling and skip the blob step)
322 BlobBackedPropertyLoader.load_property_value(self, entity)

323 SerialisedPropertyLoader.load_property_value(self, entity)

324 

325 def write_property_value(self, entity: Entity): 

# serialise first, then (if necessary) move the serialised value to blob storage and keep only the reference
326 SerialisedPropertyLoader.write_property_value(self, entity)

327 BlobBackedPropertyLoader.write_property_value(self, entity)

328 

329 def load_property_value_to_data_frame_column(self, df: pd.DataFrame): 

# resolve blob references in the column first, then de-serialise the values
330 BlobBackedPropertyLoader.load_property_value_to_data_frame_column(self, df)

331 SerialisedPropertyLoader.load_property_value_to_data_frame_column(self, df)

332 

333 
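Combined sketch (editor addition, again reusing the `backend` from the earlier sketch; names are illustrative): large numpy arrays are first serialised to JSON and, if the resulting string is too large for a table property, stored in blob storage with only a reference kept in the table:

loader = BlobBackedSerialisedPropertyLoader("cache_value", NumpyArrayJsonSerialiser(), backend,
    blob_prefix="myTable", property_boolean_blob_status_name="cache_value_blob_backed")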

334class AzureLazyBatchCommitTable: 

335 """ 

336 Wrapper for an Azure table which allows for convenient insertion via lazy batch execution per partition.

337 Uses a priority queue to manage the order in which partitions are committed.

338 To execute insertions, call one of the commit methods, e.g. :func:`AzureLazyBatchCommitTable.commit_blocking_until_empty`

339 """ 

340 

341 AZURE_ALLOWED_TABLE_NAME_PATTERN = re.compile("^[A-Za-z][A-Za-z0-9]{2,62}$") 

342 AZURE_ALLOWED_TABLE_BATCH_SIZE = 100 

343 

344 class PartitionCommandsPriorityQueue: 

345 

346 class PartitionCommands: 

347 def __init__(self, partition_key): 

348 self.partition_key = partition_key 

349 self._command_list = collections.deque() 

350 

351 def __len__(self): 

352 return len(self._command_list) 

353 

354 def append(self, command): 

355 self._command_list.append(command) 

356 

357 def execute(self, context_manager: Callable[[], TableBatch], batch_size: int): 

358 while len(self._command_list) > 0: 

359 _slice = [self._command_list.popleft() for _ in range(min(batch_size, len(self._command_list)))] 

360 _log.info(f"Committing {len(_slice)} cache entries to the database") 

361 with context_manager() as batch: 

362 for command in _slice: 

363 command(batch) 

364 

365 def __init__(self): 

366 self.partition_commands_queue = [] 

367 self.partition_key2_commands = {} 

368 self._thread_lock = threading.Lock() 

369 

370 def add_command(self, partition_key, command: Union[Callable[[TableBatch], Any], functools.partial[TableBatch]]): 

371 """ 

372 Add a command to the queue of the corresponding partition_key

373 :param partition_key: partition key determining the queue to which the command is added

374 :param command: a callable on a TableBatch 

375 """ 

376 with self._thread_lock: 

377 if partition_key not in self.partition_key2_commands: 

378 commands = self.PartitionCommands(partition_key) 

379 self.partition_commands_queue.append(commands) 

380 self.partition_key2_commands[partition_key] = commands 

381 self.partition_key2_commands[partition_key].append(command) 

382 

383 def pop(self, min_length: int = None) -> Optional[AzureLazyBatchCommitTable.PartitionCommandsPriorityQueue.PartitionCommands]: 

384 """ 

385 :param min_length: minimal length of largest PartitionCommands for the pop to take place. 

386 :return: largest PartitionCommands or None if minimal length is not reached 

387 """ 

388 with self._thread_lock: 

389 return self._pop(min_length) 

390 

391 def pop_all(self): 

392 with self._thread_lock: 

393 commands_list = [] 

394 while not self._is_empty(): 

395 commands_list.append(self._pop()) 

396 return commands_list 

397 

398 def is_empty(self): 

399 with self._thread_lock: 

400 return self._is_empty() 

401 

402 def _pop(self, min_length=None): 

403 length, index = self._get_max_priority_info() 

404 if index is not None and (min_length is None or length >= min_length): 

405 q = self.partition_commands_queue.pop(index) 

406 del self.partition_key2_commands[q.partition_key] 

407 return q 

408 else: 

409 return None 

410 

411 def _is_empty(self): 

412 return len(self.partition_commands_queue) == 0 

413 

414 def _get_max_priority_info(self): 

415 lengths_list = list(map(len, self.partition_commands_queue)) 

416 if len(lengths_list) == 0: 

417 return 0, None 

418 max_length = max(lengths_list) 

419 return max_length, lengths_list.index(max_length) 

420 

421 def __init__(self, table_name: str, table_service: TableService, property_loaders: Sequence[PropertyLoader] = ()): 

422 """ 

423 :param table_name: name of table 

424 :param table_service: instance of :class:`azure.storage.table.TableService` to connect to Azure table storage 

425 :param property_loaders: optional sequence of PropertyLoaders which are applied when writing and reading entities

426 """ 

427 

428 if not self.AZURE_ALLOWED_TABLE_NAME_PATTERN.match(table_name): 

429 raise ValueError(f"Invalid table name {table_name}, see: " 

430 f"https://docs.microsoft.com/en-us/rest/api/storageservices/Understanding-the-Table-Service-Data-Model") 

431 

432 self.table_service = table_service 

433 self.table_name = table_name 

434 self.property_loaders = property_loaders 

435 self._partition_queues = self.PartitionCommandsPriorityQueue() 

436 self._context_manager = functools.partial(self.table_service.batch, self.table_name) 

437 

438 if not self.exists(): 

439 self.table_service.create_table(self.table_name) 

440 

441 def insert_or_replace_entity(self, entity: Union[Dict, Entity]): 

442 """ 

443 Lazy wrapper method for :func:`azure.storage.table.TableService.insert_or_replace_entity` 

444 :param entity: entity (dict or Entity) to insert or replace; must contain PartitionKey and RowKey

445 """ 

446 partition_key = entity["PartitionKey"] 

447 for property_loader in self.property_loaders: 

448 property_loader.write_property_value(entity) 

449 execution_command = functools.partial(self._insert_or_replace_entity_via_batch, entity) 

450 self._partition_queues.add_command(partition_key, execution_command) 

451 

452 def insert_entity(self, entity: Union[Dict, Entity]): 

453 """ 

454 Lazy wrapper method for :func:`azure.storage.table.TableService.insert_entity` 

455 :param entity: entity (dict or Entity) to insert; must contain PartitionKey and RowKey

456 """ 

457 partition_key = entity["PartitionKey"] 

458 for property_loader in self.property_loaders: 

459 property_loader.write_property_value(entity) 

460 execution_command = functools.partial(self._insert_entity_via_batch, entity) 

461 self._partition_queues.add_command(partition_key, execution_command) 

462 

463 def get_entity(self, partition_key: str, row_key: str) -> Optional[Entity]: 

464 """ 

465 Wraps :func:`azure.storage.table.TableService.get_entity` 

466 :param partition_key: the partition key of the entity

467 :param row_key: the row key of the entity

468 :return: the entity or None if it could not be retrieved

469 """ 

470 try: 

471 entity = self.table_service.get_entity(self.table_name, partition_key, row_key) 

472 for property_loader in self.property_loaders: 

473 property_loader.load_property_value(entity) 

474 return entity 

475 except Exception as e: 

476 _log.debug(f"Unable to load value for partitionKey {partition_key} and rowKey {row_key} from table {self.table_name}: {e}") 

477 return None 

478 

479 def commit_blocking_until_empty(self, max_batch_size=AZURE_ALLOWED_TABLE_BATCH_SIZE): 

480 """ 

481 Commit insertion commands in a blocking manner. Commands are executed batch-wise per partition until the

482 partition queue is empty.

483 :param max_batch_size: maximal batch size to use for batch insertion; must be less than or equal to the batch size allowed by Azure

484 """ 

485 

486 max_batch_size = self._validate_max_batch_size(max_batch_size) 

487 

488 while not self._partition_queues.is_empty(): 

489 commands = self._partition_queues.pop() 

490 commands.execute(self._context_manager, max_batch_size) 

491 

492 def commit_non_blocking_current_queue_state(self, max_batch_size=AZURE_ALLOWED_TABLE_BATCH_SIZE): 

493 """ 

494 Commit insertion commands. Empties the current PartitionCommandsPriorityQueue in a non-blocking way.

495 Commands are executed batch-wise per partition.

496 :param max_batch_size: maximal batch size to use for batch insertion; must be less than or equal to the batch size allowed by Azure

497 """ 

498 

499 max_batch_size = self._validate_max_batch_size(max_batch_size) 

500 

501 def commit(): 

502 commands_list = self._partition_queues.pop_all() 

503 for commands in commands_list: 

504 commands.execute(self._context_manager, max_batch_size) 

505 

506 thread = threading.Thread(target=commit, daemon=False) 

507 thread.start() 

508 

509 def commit_blocking_largest_partition_from_queue(self, max_batch_size=AZURE_ALLOWED_TABLE_BATCH_SIZE, min_length=None): 

510 """ 

511 Commits, in a blocking way, the largest partition from the PartitionCommandsPriorityQueue

512 :param max_batch_size: maximal batch size to use for batch insertion; must be less than or equal to the batch size allowed by Azure

513 :param min_length: minimal size of the largest partition. If not None, pop and commit only if min_length is reached.

514 :return: 

515 """ 

516 max_batch_size = self._validate_max_batch_size(max_batch_size) 

517 commands = self._partition_queues.pop(min_length) 

518 if commands is not None: 

519 commands.execute(self._context_manager, max_batch_size) 

520 

521 def _validate_max_batch_size(self, max_batch_size): 

522 if max_batch_size > self.AZURE_ALLOWED_TABLE_BATCH_SIZE: 

523 _log.warning(f"Provided max_batch_size is larger than allowed size {self.AZURE_ALLOWED_TABLE_BATCH_SIZE}. "

524 f"Will use max_batch_size {self.AZURE_ALLOWED_TABLE_BATCH_SIZE} instead.")

525 max_batch_size = self.AZURE_ALLOWED_TABLE_BATCH_SIZE 

526 return max_batch_size 

527 

528 def load_table_to_data_frame(self, columns: List[str] = None, row_filter_query: str = None, num_records: int = None): 

529 """ 

530 Load rows of the table into a :class:`~pandas.DataFrame`

531 :param row_filter_query: optional filter query to restrict the returned rows

532 :param num_records: optional maximal number of records to load; if None, load all rows

533 :param columns: restrict loading to the provided columns

534 :return: :class:`~pandas.DataFrame` 

535 """ 

536 if num_records is None: 

537 records = list(self._iter_records(columns, row_filter_query)) 

538 else: 

539 records = [] 

540 for record in self._iter_records(columns, row_filter_query): 

541 records.append(record) 

542 if len(records) >= num_records: 

543 break 

544 df = pd.DataFrame(records, columns=columns) 

545 for property_loader in self.property_loaders: 

546 property_loader.load_property_value_to_data_frame_column(df) 

547 return df 

548 

549 def iter_data_frame_chunks(self, chunk_size: int, columns: List[str] = None, row_filter_query: str = None): 

550 """ 

551 Get a generator of DataFrame chunks; note that a trailing chunk with fewer than chunk_size records is not yielded

552 :param row_filter_query: optional filter query to restrict the returned rows

553 :param chunk_size: number of records per yielded DataFrame

554 :param columns: restrict loading to the provided columns

555 :return: generator of :class:`~pandas.DataFrame` chunks

556 """ 

557 records = [] 

558 for record in self._iter_records(columns, row_filter_query): 

559 records.append(record) 

560 if len(records) >= chunk_size: 

561 df = pd.DataFrame(records, columns=columns) 

562 for property_loader in self.property_loaders:

563 property_loader.load_property_value_to_data_frame_column(df)

564 yield df 

565 records = [] 

566 

567 def iter_records(self, columns: List[str] = None, row_filter_query: str = None): 

568 """ 

570 Get a generator of table entities

571 :param row_filter_query: optional filter query to restrict the returned entities

572 :param columns: restrict loading to the provided columns

573 :return: generator of entities

574 """ 

575 for entity in self._iter_records(columns, row_filter_query): 

576 for property_loader in self.property_loaders:

577 property_loader.load_property_value(entity)

578 yield entity 

579 

580 def _iter_records(self, columns: Optional[List[str]], row_filter_query: Optional[str]): 

581 column_names_as_comma_separated_string = None 

582 if columns is not None: 

583 column_names_as_comma_separated_string = ",".join(columns) 

584 return self.table_service.query_entities(self.table_name, select=column_names_as_comma_separated_string, 

585 filter=row_filter_query) 

586 

587 def insert_data_frame_to_table(self, df: pd.DataFrame, partition_key_generator: Callable[[str], str] = None, num_records: int = None): 

588 """ 

589 Inserts or replaces entities of the table corresponding to rows of the DataFrame, where the index of the DataFrame acts as the row key.

590 Values of object-type columns in the DataFrame may have to be serialised via JSON beforehand.

591 :param df: DataFrame to be inserted

592 :param partition_key_generator: if None, partition keys default to the table name

593 :param num_records: restrict insertion to the first num_records rows, merely for testing

594 """ 

595 for (count, (idx, row)) in enumerate(df.iterrows()): 

596 if num_records is not None: 

597 if count >= num_records: 

598 break 

599 entity = row.to_dict() 

600 entity["RowKey"] = idx 

601 entity["PartitionKey"] = self.table_name if partition_key_generator is None else partition_key_generator(idx) 

602 self.insert_or_replace_entity(entity) 

603 

604 @staticmethod 

605 def _insert_or_replace_entity_via_batch(entity, batch: TableBatch): 

606 return batch.insert_or_replace_entity(entity) 

607 

608 @staticmethod 

609 def _insert_entity_via_batch(entity, batch: TableBatch): 

610 return batch.insert_entity(entity) 

611 

612 def exists(self): 

613 return self.table_service.exists(self.table_name) 

614 

615 
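Usage sketch (editor addition; `table_service` is assumed to be an authenticated TableService instance, and the table and key names are made up):

table = AzureLazyBatchCommitTable("myCacheTable", table_service)
table.insert_or_replace_entity({"PartitionKey": "myCacheTable", "RowKey": "key1", "cache_value": "42"})
table.insert_or_replace_entity({"PartitionKey": "myCacheTable", "RowKey": "key2", "cache_value": "43"})
table.commit_blocking_until_empty()  # executes the queued insertions batch-wise per partition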

616class AzureTablePersistentKeyValueCache(PersistentKeyValueCache): 

617 """ 

618 PersistentKeyValueCache using Azure Table Storage, see https://docs.microsoft.com/en-gb/azure/storage/tables/ 

619 """ 

620 CACHE_VALUE_IDENTIFIER = "cache_value" 

621 

622 def __init__(self, table_service: TableService, table_name="cache", partition_key_generator: Callable[[str], str] = None, 

623 max_batch_size=100, min_size_for_periodic_commit: Optional[int] = 100, deferred_commit_delay_secs=1.0, in_memory=False, 

624 blob_backend: AzureTableBlobBackend = None, serialiser: Serialiser = None, max_workers: int = None): 

625 """ 

626 :param table_service: https://docs.microsoft.com/en-us/python/api/azure-cosmosdb-table/azure.cosmosdb.table.tableservice.tableservice?view=azure-python 

627 :param table_name: name of table, needs to match restrictions for Azure storage resources, see https://docs.microsoft.com/en-gb/azure/azure-resource-manager/management/resource-name-rules 

628 :param partition_key_generator: callable to generate a partition key from the provided string; if None, the partition key in requests defaults

629 to the table name

630 :param max_batch_size: maximal batch size for each commit. 

631 :param deferred_commit_delay_secs: the time frame during which no new data must be added for a pending transaction to be committed 

632 :param min_size_for_periodic_commit: minimal size of a batch to be committed in a periodic thread. 

633 If None, commits are only executed in a deferred manner, i.e. commit only if there is no update for `deferred_commit_delay_secs` 

634 :param in_memory: boolean flag indicating whether the table should be loaded into memory at construction

635 :param blob_backend: if not None, blob storage will be used to store the actual value, and cache_value in the table then only contains a reference

:param serialiser: if not None, values are serialised via this serialiser before being stored (and de-serialised upon retrieval)

636 :param max_workers: maximal number of workers to load data from blob backend

637 """ 

638 

639 self._deferredCommitDelaySecs = deferred_commit_delay_secs 

640 self._partitionKeyGenerator = partition_key_generator 

641 

642 def create_property_loaders(): 

643 if blob_backend is None and serialiser is None: 

644 _property_loaders = () 

645 elif blob_backend is None and serialiser is not None: 

646 _property_loaders = (SerialisedPropertyLoader(self.CACHE_VALUE_IDENTIFIER, serialiser),) 

647 elif blob_backend is not None and serialiser is None: 

648 property_blob_status_name = self.CACHE_VALUE_IDENTIFIER + "_blob_backed" 

649 _property_loaders = (BlobBackedPropertyLoader(self.CACHE_VALUE_IDENTIFIER, blob_backend, table_name, 

650 property_blob_status_name, max_workers),) 

651 else: 

652 property_blob_status_name = self.CACHE_VALUE_IDENTIFIER + "_blob_backed" 

653 _property_loaders = (BlobBackedSerialisedPropertyLoader(self.CACHE_VALUE_IDENTIFIER, serialiser, blob_backend, 

654 table_name, property_blob_status_name, max_workers),) 

655 return _property_loaders 

656 

657 property_loaders = create_property_loaders() 

658 self._batch_commit_table = AzureLazyBatchCommitTable(table_name, table_service, property_loaders=property_loaders) 

659 self._minSizeForPeriodicCommit = min_size_for_periodic_commit 

660 self._maxBatchSize = max_batch_size 

661 self._updateHook = PeriodicUpdateHook(deferred_commit_delay_secs, no_update_fn=self._commit, periodic_fn=self._periodically_commit) 

662 

663 self._in_memory_cache = None 

664 

665 if in_memory: 

666 df = self._batch_commit_table.load_table_to_data_frame(columns=['RowKey', self.CACHE_VALUE_IDENTIFIER]).set_index("RowKey") 

667 _log.info(f"Loaded {len(df)} entries of table {table_name} in memory") 

668 self._in_memory_cache = df[self.CACHE_VALUE_IDENTIFIER].to_dict() 

669 

670 def set(self, key, value): 

671 key_as_string = str(key) 

672 partition_key = self._get_partition_key_for_row_key(key_as_string) 

673 entity = {'PartitionKey': partition_key, 'RowKey': key_as_string, self.CACHE_VALUE_IDENTIFIER: value} 

674 self._batch_commit_table.insert_or_replace_entity(entity) 

675 self._updateHook.handle_update() 

676 

677 if self._in_memory_cache is not None: 

678 self._in_memory_cache[key_as_string] = value 

679 

680 def get(self, key): 

681 key_as_string = str(key) 

682 value = self._get_from_in_memory_cache(key_as_string) 

683 if value is None: 

684 value = self._get_from_table(key_as_string) 

685 return value 

686 

687 def _get_from_table(self, key: str): 

688 partition_key = self._get_partition_key_for_row_key(key) 

689 entity = self._batch_commit_table.get_entity(partition_key, key) 

690 if entity is not None: 

691 return entity[self.CACHE_VALUE_IDENTIFIER] 

692 return None 

693 

694 def _get_from_in_memory_cache(self, key): 

695 if self._in_memory_cache is None: 

696 return None 

697 return self._in_memory_cache.get(str(key), None) 

698 

699 def _get_partition_key_for_row_key(self, key: str): 

700 return self._batch_commit_table.table_name if self._partitionKeyGenerator is None else self._partitionKeyGenerator(key) 

701 

702 def _commit(self): 

703 self._batch_commit_table.commit_non_blocking_current_queue_state(self._maxBatchSize) 

704 

705 def _periodically_commit(self): 

706 self._batch_commit_table.commit_blocking_largest_partition_from_queue(self._maxBatchSize, self._minSizeForPeriodicCommit)
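End-to-end usage sketch (editor addition; the credentials are placeholders, and TableService is the legacy azure.storage.table / azure-cosmosdb-table client imported at the top of the module):

table_service = TableService(account_name="myaccount", account_key="<key>")
cache = AzureTablePersistentKeyValueCache(table_service, table_name="cache")
cache.set("someKey", 42)
# writes are queued and committed after deferred_commit_delay_secs without further updates;
# get() reads from the table (or from the in-memory copy when in_memory=True)
value = cache.get("someKey")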