Coverage for src/pycse/hashcache.py: 95.14%

185 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-23 16:23 -0400

1"""hashcache - a class decorator for persistent, file/hash-based cache. 

2 

3I found some features of joblib were unsuitable for how I want to use a cache. 

4 

51. The "file" Python thinks the function is in is used to save the results in 

6joblib, which leads to repeated runs if you run the same code in Python, 

7notebook or stdin, and means the cache is not portable to other machines, and 

8maybe not even in time since temp directories and kernel parameters are 

9involved. I could not figure out how to change those in joblib. 

10 

112. joblib uses the function source code in the hash, so inconsequential changes 

12like whitespace, docstrings and comments change the hash. 

13 

14This library aims to provide a simpler version of what I wish joblib did for me. 

15 

16Results are cached based on a hash of the function name, argnames, bytecode, arg 

17values and kwarg values. I use joblib.hash for this. This means any two 

18functions with the same name and bytecode, called with the same arguments, will 

19cache to the same result. 

20 

21The cache location is set as a class attribute: 

22 

23 HashCache.cache = './cache' 

24 

25 

26 HashCache - stores joblib.dump pickle strings in files named by hash 

27 

28 

29 SqlCache - stores orjson serialized data in a sqlite3 database by hash key 

30 

31 JsonCache - stores orjson serialized data in json files, compatible with maggma 

32 

33 

34This is still alpha, proof of concept code. Test it a lot for your use case. The 

35API is not stable, and subject to change. 

36 

37 

38Pros: 

39 

401. File-based cache which means many functions can run in parallel reading and 

41writing, and you are limited only by file io speeds, and disk space. 

42 

432. semi-portability. The cache could be synced across machines, and caches 

44can be merged with little risk of conflict. 

45 

463. No server is required. Everything is done at the OS level. 

47 

484. Extendability. You can define your own functions for loading and dumping 

49data. 

50 

51Cons: 

52 

531. hashes are fragile and not robust. They are fragile with respect to any 

54changes in how byte-code is made, or via mutable arguments, etc. The hashes are 

55not robust to system level changes like library versions, or global variables. 

56The only advantage of hashes is you can compute them. 

57 

582. File-based cache which means if you generate thousands of files, it can be 

59slow to delete them. Although it should be fast to access the results since you 

60access them directly by path, it will not be fast to iterate over all the 

61results, e.g. if you want to implement some kind of search or reporting. 

62 

633. No server. You have to roll your own update strategy if you run things on 

64multiple machines that should all cache to a common location. 

65 

66Changelog 

67--------- 

68 

69[2023-09-23 Sat] Changed hash signature (breaking change). It is too difficult 

70to figure out how to capture global state, and the use of internal variable 

71names is not consistent with using the bytecode to be insensitive to 

72unimportant variable name changes. 

73 

74Pulled out some functions for loading and dumping data. This is a precursor to 

75enabling other backends like lmdb or sqlite instead of files. You can then 

76simply provide new functions for this. 

77 

78[2024-06-18 Tue] Changed from function to class decorator (breaking change). 

79 

80""" 

81 

82import inspect 

83import joblib 

84import os 

85from pathlib import Path 

86import pprint 

87import socket 

88import sqlite3 

89import time 

90 

91 

def hashcache(*args, **kwargs):
    """Reject any use of the old function-style decorator.

    The function accepts and ignores every positional and keyword argument,
    and always raises an Exception whose message tells the caller to switch
    to the HashCache class decorator.
    """
    message = """The hashcache function decorator is deprecated.
    Please use the class decorator instead.

    For example:

    from pycse.hashcache import HashCache

    @HashCache
    def f(x):
        return x
    """
    raise Exception(message)

107 

108 

class HashCache:
    """Class decorator to cache function results using hashes and pickle (via joblib).

    Each result is stored in a file named by its hash. The file lives in a
    subdirectory of ``cache`` named by the first two characters of the hash.
    """

    # cache is the name of the directory to store results in
    cache = "cache"
    # version is recorded in the metadata saved with each cached result
    version = "0.1.0"
    # when verbose is True, loaded and dumped records are pretty-printed
    verbose = False

    def __init__(self, function):
        """Decorate FUNCTION.

        The metadata of FUNCTION (``__name__``, ``__doc__``, ``__module__``,
        ...) is copied onto the instance so the decorated object still
        identifies itself as the original function.
        """
        import functools

        # updated=() keeps FUNCTION.__dict__ from being merged into the
        # instance, so function attributes can never shadow cache, verbose or
        # version.
        functools.update_wrapper(self, function, updated=())
        self.function = function

    @staticmethod
    def _get_user():
        """Return the login name, or the USER environment variable (may be None)."""
        try:
            return os.getlogin()
        except OSError:
            # os.getlogin fails when there is no controlling terminal.
            return os.environ.get("USER")

    def get_standardized_args(self, args, kwargs):
        """Return a standardized dictionary of kwargs for func(args, kwargs).

        ARGS is the tuple of positional arguments and KWARGS the dictionary of
        keyword arguments for a call. They are bound to the signature of the
        decorated function, so the result maps every parameter name to its
        value. This includes default values, even if they were not passed.

        Raises TypeError (from inspect) if the arguments do not match the
        signature.
        """
        sig = inspect.signature(self.function)
        standardized_args = sig.bind(*args, **kwargs)
        standardized_args.apply_defaults()
        return standardized_args.arguments

    def get_hash(self, args, kwargs):
        """Get a hash for running FUNC(ARGS, KWARGS).

        This is the most critical feature of hashcache as it provides a key to
        store and look up results later. You should think carefully before
        changing this function, it breaks past caches.

        FUNC should be as pure as reasonable. This hash is insensitive to global
        variables.

        The hash is on the function name, bytecode, and a standardized kwargs
        including defaults. We use bytecode because it is insensitive to things
        like whitespace, comments, docstrings, and variable name changes that
        don't affect results. It is assumed that two functions with the same
        name and bytecode will evaluate to the same result. However, this makes
        the hash fragile to changes in Python version that affect bytecode.

        Returns the sha1 hex digest string computed by joblib.hash.
        """
        # NOTE(review): only co_code is hashed, not co_consts. It looks like
        # two versions of a function that differ only in a literal constant
        # would share a hash - confirm before relying on cache invalidation.
        return joblib.hash(
            [
                self.function.__code__.co_name,  # This is the function name
                self.function.__code__.co_code,  # this is the function bytecode
                # The args used, including defaults
                self.get_standardized_args(args, kwargs),
            ],
            hash_name="sha1",
        )

    def get_hashpath(self, hsh):
        """Return path to file for HSH.

        The path is <cache>/<first two characters of HSH>/<HSH>.
        """
        cache = Path(self.cache)
        hshdir = cache / hsh[0:2]
        hshpath = hshdir / hsh
        return hshpath

    def load_data(self, hsh):
        """Load data for HSH.

        HSH is a string for the hash associated with the data you want.

        Returns success, data. If it succeeds, success will be True and data is
        the "output" entry of the stored record. If the data does not exist
        yet, success will be False, and data will be None.
        """
        hshpath = self.get_hashpath(hsh)
        if not os.path.exists(hshpath):
            return False, None

        data = joblib.load(hshpath)
        if self.verbose:
            pp = pprint.PrettyPrinter(indent=4)
            pp.pprint(data)
        return True, data["output"]

    def dump_data(self, hsh, data):
        """Dump DATA into HSH.

        The parent directory is created if needed. Returns whatever
        joblib.dump returns for the written file.
        """
        hshpath = self.get_hashpath(hsh)
        os.makedirs(hshpath.parent, exist_ok=True)

        files = joblib.dump(data, hshpath)

        if self.verbose:
            pp = pprint.PrettyPrinter(indent=4)
            print(f"wrote {hshpath}")
            pp.pprint(data)

        return files

    def __call__(self, *args, **kwargs):
        """Return the cached result for this call, computing and storing it if needed."""
        hsh = self.get_hash(args, kwargs)

        # Try getting the data first
        success, data = self.load_data(hsh)

        if success:
            return data

        # we did not succeed, so we run the function, and cache it
        # We store some metadata for future analysis.
        t0 = time.time()
        value = self.function(*args, **kwargs)
        tf = time.time()

        # functions with mutable arguments can change the arguments, which
        # is a problem here. We just warn the user. Nothing else makes
        # sense, the mutability may be intentional.
        if hsh != self.get_hash(args, kwargs):
            print("WARNING something mutated, future calls will not use the cache.")

        data = {
            "output": value,
            "hash": hsh,
            "func": self.function.__code__.co_name,  # This is the function name
            "module": self.function.__module__,
            "args": args,
            "kwargs": kwargs,
            "standardized-kwargs": self.get_standardized_args(args, kwargs),
            "version": self.version,
            "cwd": os.getcwd(),
            "hostname": socket.getfqdn(),
            "user": self._get_user(),
            "run-at": t0,
            "run-at-human": time.asctime(time.localtime(t0)),
            "elapsed_time": tf - t0,
        }

        self.dump_data(hsh, data)
        return value

    @staticmethod
    def dump(**kwargs):
        """Dump KWARGS to the cache.

        Returns a hash string for future lookup.

        cache is a special kwarg that is not saved. It names the cache
        directory to write to and defaults to "cache".
        """
        t0 = time.time()
        # The hash is computed before the cache kwarg is removed, so the cache
        # location is part of the hash.
        hsh = joblib.hash(kwargs)

        cache = kwargs.pop("cache", "cache")

        data = {
            "func": "dump",
            "kwargs": kwargs,
            "hash": hsh,
            "saved-at": t0,
            "saved-at-human": time.asctime(time.localtime(t0)),
            "cwd": os.getcwd(),
            "hostname": socket.getfqdn(),
            "user": HashCache._get_user(),
        }

        # The lambda is a placeholder; only the path and dump logic is used.
        hc = HashCache(lambda x: x)
        hc.cache = cache
        hc.dump_data(hsh, data)
        return hsh

    @staticmethod
    def load(hsh, cache="cache"):
        """Load saved variables from HSH.

        CACHE is the cache directory to look in. Returns the dictionary of
        kwargs that was saved by dump, or None if there is no file for HSH.
        """
        hc = HashCache(lambda x: x)
        hc.cache = cache

        hshpath = hc.get_hashpath(hsh)
        if not os.path.exists(hshpath):
            return None
        return joblib.load(hshpath)["kwargs"]

299 

300 

class SqlCache(HashCache):
    """Class decorator to cache using orjson and sqlite.

    Data is stored in a sqlite database as json. The database has a single
    table, cache(hash TEXT unique, value TEXT).
    """

    # cache is the path of the sqlite database file
    cache = "cache.sqlite"

    # default is a serializing function for orjson
    # I guess the signature is default(self, obj)
    default = None

    def __init__(self, function):
        """Decorate FUNCTION and open the database.

        A connection to the database named by ``cache`` is opened and kept on
        the instance as ``con``. The cache table is created if it does not
        exist yet.
        """
        super().__init__(function)

        self.con = sqlite3.connect(self.cache)
        self.con.execute("CREATE TABLE if not exists cache(hash TEXT unique, value TEXT)")

    def dump_data(self, hsh, data):
        """Dump DATA into HSH.

        DATA must be serializable to json.

        Raises sqlite3.IntegrityError if a row for HSH already exists, since
        the hash column is unique.
        """
        import orjson

        value = orjson.dumps(data, default=self.default, option=orjson.OPT_SERIALIZE_NUMPY)
        # Using the connection as a context manager commits on success and
        # rolls back on error. It does not close the connection.
        with self.con:
            self.con.execute("INSERT INTO cache(hash, value) VALUES(?, ?)", (hsh, value))

    def load_data(self, hsh):
        """Load data for HSH.

        HSH is a string for the hash associated with the data you want.

        Returns success, data. If it succeeds, success will be True and data is
        the "output" entry of the stored record. If the data does not exist
        yet, success will be False, and data will be None.
        """
        import orjson

        with self.con:
            cur = self.con.execute("SELECT value FROM cache WHERE hash = ?", (hsh,))
            row = cur.fetchone()

        if row is None:
            return False, None
        return True, orjson.loads(row[0])["output"]

    @staticmethod
    def search(query, *args):
        """Run a sql QUERY with args.

        args are substituted in ? placeholders in the query.

        This is just a light wrapper on con.execute. It opens a new connection
        to the database named by SqlCache.cache and returns the cursor. The
        connection is left open because the returned cursor still needs it.
        """
        con = sqlite3.connect(SqlCache.cache)
        cur = con.execute(query, args)
        return cur

    @staticmethod
    def dump(**kwargs):
        """Dump KWARGS to the cache.

        Returns a hash string for future lookup. If the same KWARGS were
        already dumped, the existing row is kept and the same hash is
        returned.
        """
        t0 = time.time()
        hsh = joblib.hash(kwargs)

        try:
            user = os.getlogin()
        except OSError:
            # os.getlogin fails when there is no controlling terminal.
            user = os.environ.get("USER")

        data = {
            "func": "dump",
            "kwargs": kwargs,
            "hash": hsh,
            "saved-at": t0,
            "saved-at-human": time.asctime(time.localtime(t0)),
            "cwd": os.getcwd(),
            "hostname": socket.getfqdn(),
            "user": user,
        }

        # The lambda is a placeholder; only the database connection is used.
        hc = SqlCache(lambda x: x)
        try:
            hc.dump_data(hsh, data)
        except sqlite3.IntegrityError:
            # The hash is already stored, so there is nothing to write.
            pass
        finally:
            # This connection was opened only for this call, so close it
            # instead of waiting for garbage collection.
            hc.con.close()
        return hsh

    @staticmethod
    def load(hsh):
        """Load data from HSH.

        Returns the dictionary of kwargs that was saved by dump, or None if
        there is no row for HSH.
        """
        import orjson

        hc = SqlCache(lambda x: x)
        try:
            with hc.con:
                cur = hc.con.execute("SELECT value FROM cache WHERE hash = ?", (hsh,))
                row = cur.fetchone()
        finally:
            # This connection was opened only for this call.
            hc.con.close()

        # fetchone returns None when no row matches, so do not unpack blindly.
        if row is None:
            return None
        return orjson.loads(row[0])["kwargs"]

407 

408 

class JsonCache(HashCache):
    """Cache decorator that keeps every record in its own json file.

    Records are serialized with orjson. This is compatible with maggma.
    """

    # default is handed to orjson.dumps as its default= serializer
    default = None

    def __init__(self, function):
        """Decorate FUNCTION and make sure the cache has a Filestore.json."""
        import orjson

        super().__init__(function)

        filestore = Path(self.cache) / "Filestore.json"
        if os.path.exists(filestore):
            return

        # First use of this cache: create it with an empty json list.
        os.makedirs(self.cache, exist_ok=True)
        with open(filestore, "wb") as stream:
            stream.write(orjson.dumps([]))

    def dump_data(self, hsh, data):
        """Write DATA as json to the file for HSH."""
        import orjson

        target = self.get_hashpath(hsh).with_suffix(".json")
        os.makedirs(target.parent, exist_ok=True)

        payload = orjson.dumps(data, default=self.default, option=orjson.OPT_SERIALIZE_NUMPY)
        with open(target, "wb") as stream:
            stream.write(payload)

    def load_data(self, hsh):
        """Read the record stored for HSH.

        Returns (True, output) when the json file exists, where output is the
        "output" entry of the record. Otherwise returns (False, None).
        """
        import orjson

        source = self.get_hashpath(hsh).with_suffix(".json")
        if not os.path.exists(source):
            return False, None

        with open(source, "rb") as stream:
            record = orjson.loads(stream.read())

        if self.verbose:
            printer = pprint.PrettyPrinter(indent=4)
            printer.pprint(record)
        return True, record["output"]

    @staticmethod
    def dump(**kwargs):
        """Save KWARGS in the cache and return the hash to look them up with."""
        started = time.time()
        hsh = joblib.hash(kwargs)

        try:
            user = os.getlogin()
        except OSError:
            user = os.environ.get("USER")

        record = {
            "func": "dump",
            "kwargs": kwargs,
            "hash": hsh,
            "saved-at": started,
            "saved-at-human": time.asctime(time.localtime(started)),
            "cwd": os.getcwd(),
            "hostname": socket.getfqdn(),
            "user": user,
        }

        # The lambda is a placeholder; dump_data does the directory creation
        # and the json write.
        JsonCache(lambda x: x).dump_data(hsh, record)
        return hsh

    @staticmethod
    def load(hsh):
        """Return the kwargs saved under HSH, or None if there is no file for it."""
        import orjson

        source = JsonCache(lambda x: x).get_hashpath(hsh).with_suffix(".json")
        if not os.path.exists(source):
            return None

        with open(source, "rb") as stream:
            return orjson.loads(stream.read())["kwargs"]