Import python modules by their hash.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

524 lines
14 KiB

  1. # Copyright 2020 John-Mark Gurney.
  2. # All rights reserved.
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions
  6. # are met:
  7. # 1. Redistributions of source code must retain the above copyright
  8. # notice, this list of conditions and the following disclaimer.
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  14. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  16. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  17. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  18. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  19. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  20. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  21. # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  22. # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  23. # SUCH DAMAGE.
  24. import contextlib
  25. import glob
  26. import hashlib
  27. import importlib.resources
  28. import mock
  29. import os.path
  30. import pathlib
  31. import shutil
  32. import sys
  33. import tempfile
  34. import urllib
  35. from importlib.abc import MetaPathFinder, Loader
  36. from importlib.machinery import ModuleSpec
  37. @contextlib.contextmanager
  38. def tempset(obj, key, value):
  39. '''A context (with) manager for changing the value of an item in a
  40. dictionary, and restoring it after the with block.
  41. Example usage:
  42. ```
  43. d = dict(a=5, b=10)
  44. with tempset(d, 'a', 15):
  45. print(repr(d['a'])
  46. print(repr(d['a'])
  47. ```
  48. '''
  49. try:
  50. oldvalue = obj[key]
  51. obj[key] = value
  52. yield
  53. finally:
  54. obj[key] = oldvalue
  55. @contextlib.contextmanager
  56. def tempattrset(obj, key, value):
  57. '''A context (with) manager for changing the value of an attribute
  58. of an object, and restoring it after the with block.
  59. If the attribute does not exist, it will be deleted afterward.
  60. Example usage:
  61. ```
  62. with tempattrset(someobj, 'a', 15):
  63. print(repr(someobj.a)
  64. print(repr(someobj.a)
  65. ```
  66. '''
  67. try:
  68. dodelattr = False
  69. if hasattr(obj, key):
  70. oldvalue = getattr(obj, key)
  71. else:
  72. dodelattr = True
  73. setattr(obj, key, value)
  74. yield
  75. finally:
  76. if not dodelattr:
  77. setattr(obj, key, oldvalue)
  78. else:
  79. delattr(obj, key)
  80. class FileDirCAS(object):
  81. '''A file loader for CAS that operates on a directory. It looks
  82. at files, caches their hash, and loads them upon request.'''
  83. def __init__(self, path):
  84. self._path = pathlib.Path(path)
  85. self._hashes = {}
  86. def refresh_dir(self):
  87. '''Internal method to refresh the internal cache of
  88. hashes.'''
  89. for i in glob.glob(os.path.join(self._path, '*.py')):
  90. _, hash = self.read_hash_file(i)
  91. self._hashes[hash] = i
  92. @staticmethod
  93. def read_hash_file(fname):
  94. '''Helper function that will read the file at fname, and
  95. return the tuple of it's contents and it's hash.'''
  96. with open(fname, 'rb') as fp:
  97. data = fp.read()
  98. hash = hashlib.sha256(data).hexdigest()
  99. return data, hash
  100. def is_package(self, hash):
  101. '''Decode the provided hash, and decide if it's a package
  102. or not.'''
  103. return False
  104. def fetch_data(self, url):
  105. '''Given the URL (must be a hash URL), return the code for it.'''
  106. self.refresh_dir()
  107. hashurl = url
  108. if hashurl.scheme != 'hash' or hashurl.netloc != 'sha256':
  109. raise ValueError('invalid hash url')
  110. hash = hashurl.path[1:]
  111. fname = self._hashes[hash]
  112. data, fhash = self.read_hash_file(fname)
  113. if fhash != hash:
  114. raise ValueError('file no longer matches hash on disk')
  115. return data
  116. class CASFinder(MetaPathFinder, Loader):
  117. '''Overall class for using Content Addressable Storage to load
  118. Python modules into your code. It contains code to dispatch to
  119. the various loaders to attempt to load the hash.'''
  120. def __init__(self):
  121. self._loaders = []
  122. self._aliases = {}
  123. if [ x for x in sys.meta_path if isinstance(x, self.__class__) ]:
  124. raise RuntimeError('cannot register more than on CASFinder')
  125. sys.meta_path.append(self)
  126. def __enter__(self):
  127. return self
  128. def __exit__(self, exc_type, exc_value, traceback):
  129. self.disconnect()
  130. def load_aliases(self, name):
  131. '''Load the aliases from the module with the passed in name.'''
  132. aliases = importlib.resources.read_text(sys.modules[name], 'cas_aliases.txt')
  133. self._aliases.update(self._parsealiases(aliases))
  134. @staticmethod
  135. def _makebasichashurl(url):
  136. try:
  137. hashurl = urllib.parse.urlparse(url)
  138. except AttributeError:
  139. hashurl = url
  140. return urllib.parse.urlunparse(hashurl[:3] + ('', '', ''))
  141. @classmethod
  142. def _parsealiases(cls, data):
  143. ret = {}
  144. lines = data.split('\n')
  145. for i in lines:
  146. if not i:
  147. continue
  148. name, hash = i.split()
  149. ret.setdefault(name, []).append(hash)
  150. # split out the hashes
  151. for items in list(ret.values()):
  152. lst = [ x for x in items if not x.startswith('hash://') ]
  153. for h in [ x for x in items if x.startswith('hash://') ]:
  154. h = cls._makebasichashurl(h)
  155. ret[h] = lst
  156. return ret
  157. def disconnect(self):
  158. '''Disconnect this Finder from being used to load modules.
  159. As this claims an entire namespace, only the first loaded
  160. one will work, and any others will be hidden until the
  161. first one is disconnected.
  162. This can be used w/ a with block to automatically
  163. disconnect when no longer needed. This is mostly useful
  164. for testing.'''
  165. try:
  166. sys.meta_path.remove(self)
  167. except ValueError:
  168. pass
  169. def register(self, loader):
  170. '''Register a loader w/ this finder. This will attempt
  171. to load the hash passed to it. It is also (currently)
  172. responsible for executing the code in the module.'''
  173. self._loaders.append(loader)
  174. # MetaPathFinder methods
  175. def find_spec(self, fullname, path, target=None):
  176. if path is None:
  177. ms = ModuleSpec(fullname, self, is_package=True)
  178. else:
  179. parts = fullname.split('.')
  180. ver, typ, arg = parts[1].split('_')
  181. if typ == 'f':
  182. # make hash url:
  183. hashurl = 'hash://sha256/%s' % bytes.fromhex(arg).hex()
  184. hashurl = urllib.parse.urlparse(hashurl)
  185. for l in self._loaders:
  186. ispkg = l.is_package(hashurl)
  187. break
  188. else:
  189. return None
  190. else:
  191. # an alias
  192. for i in self._aliases[arg]:
  193. hashurl = urllib.parse.urlparse(i)
  194. if hashurl.scheme == 'hash':
  195. break
  196. else:
  197. raise ValueError('unable to find bash hash url for alias %s' % repr(arg))
  198. ms = ModuleSpec(fullname, self, is_package=False, loader_state=(hashurl,))
  199. return ms
  200. def invalidate_caches(self):
  201. return None
  202. # Loader methods
  203. def exec_module(self, module):
  204. if module.__name__ == 'cas':
  205. pass
  206. else:
  207. (url,) = module.__spec__.loader_state
  208. for load in self._loaders:
  209. try:
  210. data = load.fetch_data(url)
  211. break
  212. except Exception:
  213. pass
  214. else:
  215. for url in self._aliases[self._makebasichashurl(url)]:
  216. url = urllib.parse.urlparse(url)
  217. for load in self._loaders:
  218. try:
  219. data = load.fetch_data(url)
  220. break
  221. except Exception:
  222. pass
  223. else:
  224. continue
  225. break
  226. else:
  227. raise ValueError('unable to find loader for url %s' % repr(urllib.parse.urlunparse(url)))
  228. exec(data, module.__dict__)
  229. def defaultinit(casf):
  230. cachedir = pathlib.Path.home() / '.casimport_cache'
  231. cachedir.mkdir(exist_ok=True)
  232. casf.register(FileDirCAS(cachedir))
  233. # The global version
  234. _casfinder = CASFinder()
  235. load_aliases = _casfinder.load_aliases
  236. defaultinit(_casfinder)
  237. import unittest
  238. class TestHelpers(unittest.TestCase):
  239. def test_testset(self):
  240. origobj = object()
  241. d = dict(a=origobj, b=10)
  242. # that when we temporarily set it
  243. with tempset(d, 'a', 15):
  244. # the new value is there
  245. self.assertEqual(d['a'], 15)
  246. # and that the original object is restored
  247. self.assertIs(d['a'], origobj)
  248. def test_testattrset(self):
  249. class TestObj(object):
  250. pass
  251. testobj = TestObj()
  252. # that when we temporarily set it
  253. with tempattrset(testobj, 'a', 15):
  254. # the new value is there
  255. self.assertEqual(testobj.a, 15)
  256. # and that there is no object
  257. self.assertFalse(hasattr(testobj, 'a'))
  258. origobj = object()
  259. newobj = object()
  260. testobj.b = origobj
  261. # that when we temporarily set it
  262. with tempattrset(testobj, 'b', newobj):
  263. # the new value is there
  264. self.assertIs(testobj.b, newobj)
  265. # and the original value is restored
  266. self.assertIs(testobj.b, origobj)
  267. class Test(unittest.TestCase):
  268. def setUp(self):
  269. # clear out the default casfinder if there is one
  270. self.old_meta_path = sys.meta_path
  271. sys.meta_path = [ x for x in sys.meta_path if not isinstance(x, CASFinder) ]
  272. # setup temporary directory
  273. d = pathlib.Path(os.path.realpath(tempfile.mkdtemp()))
  274. self.basetempdir = d
  275. self.tempdir = d / 'subdir'
  276. self.tempdir.mkdir()
  277. self.fixtures = pathlib.Path(__file__).parent.parent / 'fixtures'
  278. def tearDown(self):
  279. # restore environment
  280. sys.meta_path = self.old_meta_path
  281. importlib.invalidate_caches()
  282. # clean up sys.modules
  283. [ sys.modules.pop(x) for x in list(sys.modules.keys()) if
  284. x == 'cas' or x.startswith('cas.') ]
  285. shutil.rmtree(self.basetempdir)
  286. self.tempdir = None
  287. def test_filedircas_limit_refresh(self):
  288. # XXX - only refresh when the dir has changed, and each
  289. # file has changed
  290. pass
  291. def test_casimport(self):
  292. # That a CASFinder
  293. f = CASFinder()
  294. # make sure that we can't import anything at first
  295. with self.assertRaises(ImportError):
  296. import cas.v1_f_2398472398
  297. # when registering the fixtures directory
  298. f.register(FileDirCAS(self.fixtures))
  299. # can import the function
  300. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  301. name = 'Olof'
  302. # and run the code
  303. self.assertEqual(hello(name), 'hello ' + name)
  304. # and when finished, can disconnect
  305. f.disconnect()
  306. # and is no longer in the meta_path
  307. self.assertNotIn(f, sys.meta_path)
  308. # and when disconnected as second time, nothing happens
  309. f.disconnect()
  310. def test_defaultinit(self):
  311. temphome = self.tempdir / 'home'
  312. temphome.mkdir()
  313. cachedir = temphome / '.casimport_cache'
  314. with tempset(os.environ, 'HOME', str(temphome)):
  315. with CASFinder() as f:
  316. # Setup the defaults
  317. defaultinit(f)
  318. # that the cache got created
  319. self.assertTrue(cachedir.is_dir())
  320. # and that when hello.py is copied to the cache
  321. shutil.copy(self.fixtures / 'hello.py', cachedir)
  322. # it can be imported
  323. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  324. with CASFinder() as f:
  325. defaultinit(f)
  326. # and that a new CASFinder can still find it
  327. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  328. def test_multiplecas(self):
  329. # that once we have one
  330. with CASFinder() as f:
  331. # if we try to create a second, it fails
  332. self.assertRaises(RuntimeError, CASFinder)
  333. def test_parsealiases(self):
  334. with open(self.fixtures / 'randpkg' / 'cas_aliases.txt') as fp:
  335. aliasdata = fp.read()
  336. res = CASFinder._parsealiases(aliasdata)
  337. self.assertEqual(res, {
  338. 'hello': [
  339. 'hash://sha256/330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3?type=text/x-python',
  340. 'ipfs://bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym',
  341. ],
  342. 'hash://sha256/330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3': [
  343. 'ipfs://bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym',
  344. ],
  345. })
  346. def test_aliasimports(self):
  347. # setup the cache
  348. temphome = self.tempdir / 'home'
  349. temphome.mkdir()
  350. cachedir = temphome / '.casimport_cache'
  351. # add the test module's path
  352. fixdir = str(self.fixtures)
  353. sys.path.append(fixdir)
  354. with tempset(os.environ, 'HOME', str(temphome)):
  355. try:
  356. with CASFinder() as f, \
  357. tempattrset(sys.modules[__name__], 'load_aliases',
  358. f.load_aliases):
  359. defaultinit(f)
  360. # and that hello.py is in the cache
  361. shutil.copy(self.fixtures / 'hello.py', cachedir)
  362. # that the import is successful
  363. import randpkg
  364. # and pulled in the method
  365. self.assertTrue(hasattr(randpkg, 'hello'))
  366. del sys.modules['randpkg']
  367. finally:
  368. sys.path.remove(fixdir)
  369. def test_aliasipfsimports(self):
  370. # add the test module's path
  371. fixdir = str(self.fixtures)
  372. sys.path.append(fixdir)
  373. # that a fake ipfsloader
  374. with open(self.fixtures / 'hello.py') as fp:
  375. # that returns the correct data
  376. fakedata = fp.read()
  377. def fakeload(url, fd=fakedata):
  378. if url.scheme != 'ipfs' or url.netloc != 'bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym':
  379. raise ValueError
  380. return fd
  381. fakeipfsloader = mock.MagicMock()
  382. fakeipfsloader.fetch_data = fakeload
  383. try:
  384. with CASFinder() as f, \
  385. tempattrset(sys.modules[__name__], 'load_aliases',
  386. f.load_aliases):
  387. f.register(fakeipfsloader)
  388. # that the import is successful
  389. import randpkg
  390. # and pulled in the method
  391. self.assertTrue(hasattr(randpkg, 'hello'))
  392. finally:
  393. sys.path.remove(fixdir)
  394. def test_overlappingaliases(self):
  395. # make sure that an aliases file is consistent and does not
  396. # override other urls. That is that any hashes are consistent,
  397. # and that they have at least one root hash that is the same, and
  398. # will be used for fetching.
  399. #
  400. # Likely will also have to deal w/ an issue where two aliases share
  401. # sha256, and a third shares sha512, which in this case, BOTH hashse
  402. # have to be checked.
  403. pass
  404. def test_loaderpriority(self):
  405. # XXX - write test to allow you to specify the priority of
  406. # a loader, to ensure that cache stays at top.
  407. # Maybe also think of a way to say local/remote, because
  408. # some loaders may be "more local" than others, like using
  409. # a local ipfs gateway makes more sense than hitting a
  410. # public gateway
  411. pass