Import python modules by their hash.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

696 lines
18 KiB

  1. # Copyright 2020 John-Mark Gurney.
  2. # All rights reserved.
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions
  6. # are met:
  7. # 1. Redistributions of source code must retain the above copyright
  8. # notice, this list of conditions and the following disclaimer.
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  14. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  16. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  17. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  18. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  19. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  20. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  21. # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  22. # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  23. # SUCH DAMAGE.
  24. import contextlib
  25. import functools
  26. import glob
  27. import hashlib
  28. import importlib.resources
  29. import mock
  30. import os.path
  31. import pathlib
  32. import shutil
  33. import sys
  34. import tempfile
  35. import urllib.request
  36. from importlib.abc import MetaPathFinder, Loader
  37. from importlib.machinery import ModuleSpec
  38. def _printanyexc(f): # pragma: no cover
  39. '''Prints any exception that gets raised by the wrapped function.'''
  40. @functools.wraps(f)
  41. def wrapper(*args, **kwargs):
  42. try:
  43. return f(*args, **kwargs)
  44. except Exception:
  45. import traceback
  46. traceback.print_exc()
  47. raise
  48. return wrapper
  49. @contextlib.contextmanager
  50. def tempset(obj, key, value):
  51. '''A context (with) manager for changing the value of an item in a
  52. dictionary, and restoring it after the with block.
  53. Example usage:
  54. ```
  55. d = dict(a=5, b=10)
  56. with tempset(d, 'a', 15):
  57. print(repr(d['a'])
  58. print(repr(d['a'])
  59. ```
  60. '''
  61. try:
  62. oldvalue = obj[key]
  63. obj[key] = value
  64. yield
  65. finally:
  66. obj[key] = oldvalue
  67. @contextlib.contextmanager
  68. def tempattrset(obj, key, value):
  69. '''A context (with) manager for changing the value of an attribute
  70. of an object, and restoring it after the with block.
  71. If the attribute does not exist, it will be deleted afterward.
  72. Example usage:
  73. ```
  74. with tempattrset(someobj, 'a', 15):
  75. print(repr(someobj.a)
  76. print(repr(someobj.a)
  77. ```
  78. '''
  79. try:
  80. dodelattr = False
  81. if hasattr(obj, key):
  82. oldvalue = getattr(obj, key)
  83. else:
  84. dodelattr = True
  85. setattr(obj, key, value)
  86. yield
  87. finally:
  88. if not dodelattr:
  89. setattr(obj, key, oldvalue)
  90. else:
  91. delattr(obj, key)
  92. class HTTPSCAS(object):
  93. def fetch_data(self, url):
  94. if url.scheme != 'https':
  95. raise ValueError('cannot handle scheme %s' %
  96. repr(url.scheme))
  97. url = urllib.parse.urlunparse(url)
  98. with urllib.request.urlopen(url) as req:
  99. if req.status // 100 != 2:
  100. raise RuntimeError('bad fetch')
  101. return req.read()
  102. class IPFSCAS(object):
  103. gwhost = 'gateway.ipfs.io'
  104. gwhost = 'cloudflare-ipfs.com'
  105. def make_url(self, url):
  106. return urllib.parse.urlunparse(('https', self.gwhost,
  107. '/ipfs/' + url.netloc) + ('', ) * 3)
  108. def fetch_data(self, url):
  109. if url.scheme != 'ipfs':
  110. raise ValueError('cannot handle scheme %s' %
  111. repr(url.scheme))
  112. gwurl = self.make_url(url)
  113. with urllib.request.urlopen(gwurl) as req:
  114. if req.status // 100 != 2:
  115. raise RuntimeError('bad fetch')
  116. return req.read()
  117. class FileDirCAS(object):
  118. '''A file loader for CAS that operates on a directory. It looks
  119. at files, caches their hash, and loads them upon request.'''
  120. def __init__(self, path):
  121. self._path = pathlib.Path(path)
  122. self._hashes = {}
  123. def refresh_dir(self):
  124. '''Internal method to refresh the internal cache of
  125. hashes.'''
  126. for i in glob.glob(os.path.join(self._path, '*.py')):
  127. _, hash = self.read_hash_file(i)
  128. self._hashes[hash] = i
  129. @staticmethod
  130. def read_hash_file(fname):
  131. '''Helper function that will read the file at fname, and
  132. return the tuple of it's contents and it's hash.'''
  133. with open(fname, 'rb') as fp:
  134. data = fp.read()
  135. hash = hashlib.sha256(data).hexdigest()
  136. return data, hash
  137. def is_package(self, hash):
  138. '''Decode the provided hash, and decide if it's a package
  139. or not.'''
  140. return False
  141. def fetch_data(self, url):
  142. '''Given the URL (must be a hash URL), return the code for
  143. it.'''
  144. self.refresh_dir()
  145. hashurl = url
  146. if hashurl.scheme != 'hash' or hashurl.netloc != 'sha256':
  147. raise ValueError('invalid hash url')
  148. hash = hashurl.path[1:]
  149. fname = self._hashes[hash]
  150. data, fhash = self.read_hash_file(fname)
  151. if fhash != hash:
  152. raise ValueError('file no longer matches hash on disk')
  153. return data
  154. class CASFinder(MetaPathFinder, Loader):
  155. '''Overall class for using Content Addressable Storage to load
  156. Python modules into your code. It contains code to dispatch to
  157. the various loaders to attempt to load the hash.'''
  158. def __init__(self):
  159. self._loaders = []
  160. self._aliases = {}
  161. if [ x for x in sys.meta_path if
  162. isinstance(x, self.__class__) ]:
  163. raise RuntimeError(
  164. 'cannot register more than on CASFinder')
  165. sys.meta_path.append(self)
  166. def __enter__(self):
  167. return self
  168. def __exit__(self, exc_type, exc_value, traceback):
  169. self.disconnect()
  170. def load_mod_aliases(self, name):
  171. '''Load the aliases from the module with the passed in name.'''
  172. aliases = importlib.resources.read_text(sys.modules[name],
  173. 'cas_aliases.txt')
  174. self._aliases.update(self._parsealiases(aliases))
  175. @staticmethod
  176. def _makebasichashurl(url):
  177. try:
  178. hashurl = urllib.parse.urlparse(url)
  179. except AttributeError:
  180. hashurl = url
  181. return urllib.parse.urlunparse(hashurl[:3] + ('', '', ''))
  182. @classmethod
  183. def _parsealiases(cls, data):
  184. ret = {}
  185. lines = data.split('\n')
  186. for i in lines:
  187. if not i:
  188. continue
  189. name, hash = i.split()
  190. ret.setdefault(name, []).append(hash)
  191. # split out the hashes
  192. for items in list(ret.values()):
  193. lst = [ x for x in items if
  194. not x.startswith('hash://') ]
  195. for h in [ x for x in items if
  196. x.startswith('hash://') ]:
  197. h = cls._makebasichashurl(h)
  198. ret[h] = lst
  199. return ret
  200. def disconnect(self):
  201. '''Disconnect this Finder from being used to load modules.
  202. As this claims an entire namespace, only the first loaded
  203. one will work, and any others will be hidden until the
  204. first one is disconnected.
  205. This can be used w/ a with block to automatically
  206. disconnect when no longer needed. This is mostly useful
  207. for testing.'''
  208. try:
  209. sys.meta_path.remove(self)
  210. except ValueError:
  211. pass
  212. def register(self, loader):
  213. '''Register a loader w/ this finder. This will attempt
  214. to load the hash passed to it. It is also (currently)
  215. responsible for executing the code in the module.'''
  216. self._loaders.append(loader)
  217. # MetaPathFinder methods
  218. def find_spec(self, fullname, path, target=None):
  219. if path is None:
  220. ms = ModuleSpec(fullname, self, is_package=True)
  221. else:
  222. parts = fullname.split('.')
  223. ver, typ, arg = parts[1].split('_')
  224. if typ == 'f':
  225. # make hash url:
  226. hashurl = ('hash://sha256/%s' %
  227. bytes.fromhex(arg).hex())
  228. hashurl = urllib.parse.urlparse(hashurl)
  229. for l in self._loaders:
  230. ispkg = l.is_package(hashurl)
  231. break
  232. else:
  233. return None
  234. else:
  235. # an alias
  236. for i in self._aliases[arg]:
  237. hashurl = urllib.parse.urlparse(i)
  238. if hashurl.scheme == 'hash':
  239. break
  240. else:
  241. raise ValueError('unable to find bash hash url for alias %s' % repr(arg))
  242. ms = ModuleSpec(fullname, self, is_package=False,
  243. loader_state=(hashurl,))
  244. return ms
  245. def invalidate_caches(self):
  246. return None
  247. # Loader methods
  248. def exec_module(self, module):
  249. if module.__name__ == 'cas':
  250. pass
  251. else:
  252. (url,) = module.__spec__.loader_state
  253. for load in self._loaders:
  254. try:
  255. data = load.fetch_data(url)
  256. break
  257. except Exception:
  258. pass
  259. else:
  260. for url in self._aliases[
  261. self._makebasichashurl(url)]:
  262. url = urllib.parse.urlparse(url)
  263. for load in self._loaders:
  264. try:
  265. data = load.fetch_data(url)
  266. break
  267. except Exception:
  268. pass
  269. else:
  270. continue
  271. break
  272. else:
  273. raise ValueError('unable to find loader for url %s' % repr(urllib.parse.urlunparse(url)))
  274. exec(data, module.__dict__)
  275. def defaultinit(casf):
  276. cachedir = pathlib.Path.home() / '.casimport_cache'
  277. cachedir.mkdir(exist_ok=True)
  278. casf.register(FileDirCAS(cachedir))
  279. casf.register(IPFSCAS())
  280. casf.register(HTTPSCAS())
  281. # The global version
  282. _casfinder = CASFinder()
  283. load_mod_aliases = _casfinder.load_mod_aliases
  284. defaultinit(_casfinder)
  285. import unittest
  286. class TestHelpers(unittest.TestCase):
  287. def test_testset(self):
  288. origobj = object()
  289. d = dict(a=origobj, b=10)
  290. # that when we temporarily set it
  291. with tempset(d, 'a', 15):
  292. # the new value is there
  293. self.assertEqual(d['a'], 15)
  294. # and that the original object is restored
  295. self.assertIs(d['a'], origobj)
  296. def test_testattrset(self):
  297. class TestObj(object):
  298. pass
  299. testobj = TestObj()
  300. # that when we temporarily set it
  301. with tempattrset(testobj, 'a', 15):
  302. # the new value is there
  303. self.assertEqual(testobj.a, 15)
  304. # and that there is no object
  305. self.assertFalse(hasattr(testobj, 'a'))
  306. origobj = object()
  307. newobj = object()
  308. testobj.b = origobj
  309. # that when we temporarily set it
  310. with tempattrset(testobj, 'b', newobj):
  311. # the new value is there
  312. self.assertIs(testobj.b, newobj)
  313. # and the original value is restored
  314. self.assertIs(testobj.b, origobj)
  315. class Test(unittest.TestCase):
  316. def setUp(self):
  317. # clear out the default casfinder if there is one
  318. self.old_meta_path = sys.meta_path
  319. sys.meta_path = [ x for x in sys.meta_path if
  320. not isinstance(x, CASFinder) ]
  321. # setup temporary directory
  322. d = pathlib.Path(os.path.realpath(tempfile.mkdtemp()))
  323. self.basetempdir = d
  324. self.tempdir = d / 'subdir'
  325. self.tempdir.mkdir()
  326. self.fixtures = \
  327. pathlib.Path(__file__).parent.parent / 'fixtures'
  328. def tearDown(self):
  329. # restore environment
  330. sys.meta_path = self.old_meta_path
  331. importlib.invalidate_caches()
  332. # clean up sys.modules
  333. [ sys.modules.pop(x) for x in list(sys.modules.keys()) if
  334. x == 'cas' or x.startswith('cas.') ]
  335. shutil.rmtree(self.basetempdir)
  336. self.tempdir = None
  337. def test_filedircas_limit_refresh(self):
  338. # XXX - only refresh when the dir has changed, and each
  339. # file has changed
  340. pass
  341. def test_casimport(self):
  342. # That a CASFinder
  343. f = CASFinder()
  344. # make sure that we can't import anything at first
  345. with self.assertRaises(ImportError):
  346. import cas.v1_f_2398472398
  347. # when registering the fixtures directory
  348. f.register(FileDirCAS(self.fixtures))
  349. # can import the function
  350. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  351. name = 'Olof'
  352. # and run the code
  353. self.assertEqual(hello(name), 'hello ' + name)
  354. # and when finished, can disconnect
  355. f.disconnect()
  356. # and is no longer in the meta_path
  357. self.assertNotIn(f, sys.meta_path)
  358. # and when disconnected as second time, nothing happens
  359. f.disconnect()
  360. def test_defaultinit(self):
  361. temphome = self.tempdir / 'home'
  362. temphome.mkdir()
  363. cachedir = temphome / '.casimport_cache'
  364. with tempset(os.environ, 'HOME', str(temphome)):
  365. with CASFinder() as f:
  366. # Setup the defaults
  367. defaultinit(f)
  368. # that the cache got created
  369. self.assertTrue(cachedir.is_dir())
  370. # and that when hello.py is copied to the cache
  371. shutil.copy(self.fixtures / 'hello.py', cachedir)
  372. # it can be imported
  373. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  374. # and that the second loader is the IPFSCAS
  375. self.assertIsInstance(f._loaders[1], IPFSCAS)
  376. # and that the third loader is the HTTPSCAS
  377. self.assertIsInstance(f._loaders[2], HTTPSCAS)
  378. with CASFinder() as f:
  379. defaultinit(f)
  380. # and that a new CASFinder can still find it
  381. from cas.v1_f_330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3 import hello
  382. def test_multiplecas(self):
  383. # that once we have one
  384. with CASFinder() as f:
  385. # if we try to create a second, it fails
  386. self.assertRaises(RuntimeError, CASFinder)
  387. def test_parsealiases(self):
  388. with open(self.fixtures / 'randpkg' / 'cas_aliases.txt') as fp:
  389. aliasdata = fp.read()
  390. res = CASFinder._parsealiases(aliasdata)
  391. self.assertEqual(res, {
  392. 'hello': [
  393. 'hash://sha256/330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3?type=text/x-python',
  394. 'ipfs://bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym',
  395. 'https://www.funkthat.com/gitea/jmg/casimport/raw/commit/753e64f53c73d9d1afc4d8a617edb9d3542dcea2/fixtures/hello.py',
  396. ],
  397. 'hash://sha256/330884aa2febb5e19fb7194ec6a69ed11dd3d77122f1a5175ee93e73cf0161c3': [
  398. 'ipfs://bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym',
  399. 'https://www.funkthat.com/gitea/jmg/casimport/raw/commit/753e64f53c73d9d1afc4d8a617edb9d3542dcea2/fixtures/hello.py',
  400. ],
  401. })
  402. def test_aliasimports(self):
  403. # setup the cache
  404. temphome = self.tempdir / 'home'
  405. temphome.mkdir()
  406. cachedir = temphome / '.casimport_cache'
  407. # add the test module's path
  408. fixdir = str(self.fixtures)
  409. sys.path.append(fixdir)
  410. with tempset(os.environ, 'HOME', str(temphome)):
  411. try:
  412. with CASFinder() as f, \
  413. tempattrset(sys.modules[__name__],
  414. 'load_mod_aliases', f.load_mod_aliases):
  415. defaultinit(f)
  416. # and that hello.py is in the cache
  417. shutil.copy(self.fixtures / 'hello.py',
  418. cachedir)
  419. # that the import is successful
  420. import randpkg
  421. # and pulled in the method
  422. self.assertTrue(hasattr(randpkg, 'hello'))
  423. del sys.modules['randpkg']
  424. finally:
  425. sys.path.remove(fixdir)
  426. def test_aliasipfsimports(self):
  427. # add the test module's path
  428. fixdir = str(self.fixtures)
  429. sys.path.append(fixdir)
  430. # that a fake ipfsloader
  431. with open(self.fixtures / 'hello.py') as fp:
  432. # that returns the correct data
  433. fakedata = fp.read()
  434. def fakeload(url, fd=fakedata):
  435. if url.scheme != 'ipfs' or url.netloc != 'bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym':
  436. raise ValueError
  437. return fd
  438. fakeipfsloader = mock.MagicMock()
  439. fakeipfsloader.fetch_data = fakeload
  440. try:
  441. with CASFinder() as f, \
  442. tempattrset(sys.modules[__name__], 'load_mod_aliases',
  443. f.load_mod_aliases):
  444. f.register(fakeipfsloader)
  445. # that the import is successful
  446. import randpkg
  447. # and pulled in the method
  448. self.assertTrue(hasattr(randpkg, 'hello'))
  449. finally:
  450. sys.path.remove(fixdir)
  451. @mock.patch('urllib.request.urlopen')
  452. def test_ipfscasloader(self, uomock):
  453. # prep return test data
  454. with open(self.fixtures / 'hello.py') as fp:
  455. # that returns the correct data
  456. ipfsdata = fp.read()
  457. # that the ipfs CAS loader
  458. ipfs = IPFSCAS()
  459. # that the request is successfull
  460. uomock.return_value.__enter__.return_value.status = 200
  461. # and returns the correct data
  462. uomock.return_value.__enter__.return_value.read.return_value = ipfsdata
  463. # that when called
  464. hashurl = urllib.parse.urlparse('ipfs://bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym')
  465. data = ipfs.fetch_data(hashurl)
  466. # it opens the correct url
  467. uomock.assert_called_with('https://cloudflare-ipfs.com/ipfs/bafkreibtbcckul7lwxqz7nyzj3dknhwrdxj5o4jc6gsroxxjhzz46albym')
  468. # and returns the correct data
  469. self.assertEqual(data, ipfsdata)
  470. with self.assertRaises(ValueError):
  471. # that a hash url fails
  472. ipfs.fetch_data(urllib.parse.urlparse('hash://sha256/asldfkj'))
  473. # that when the request fails
  474. uomock.return_value.__enter__.return_value.status = 400
  475. # it raises a RuntimeError
  476. with self.assertRaises(RuntimeError):
  477. ipfs.fetch_data(hashurl)
  478. # Note: mostly copied from above, test_ipfscasloader
  479. @mock.patch('urllib.request.urlopen')
  480. def test_httpscasloader(self, uomock):
  481. # prep return test data
  482. with open(self.fixtures / 'hello.py') as fp:
  483. # that returns the correct data
  484. httpsdata = fp.read()
  485. # that the https CAS loader
  486. httpsldr = HTTPSCAS()
  487. # that the request is successfull
  488. uomock.return_value.__enter__.return_value.status = 200
  489. # and returns the correct data
  490. uomock.return_value.__enter__.return_value.read.return_value = httpsdata
  491. # that when called
  492. hashurl = urllib.parse.urlparse('https://www.funkthat.com/gitea/jmg/casimport/raw/commit/753e64f53c73d9d1afc4d8a617edb9d3542dcea2/fixtures/hello.py')
  493. data = httpsldr.fetch_data(hashurl)
  494. # it opens the correct url
  495. uomock.assert_called_with('https://www.funkthat.com/gitea/jmg/casimport/raw/commit/753e64f53c73d9d1afc4d8a617edb9d3542dcea2/fixtures/hello.py')
  496. # and returns the correct data
  497. self.assertEqual(data, httpsdata)
  498. with self.assertRaises(ValueError):
  499. # that a hash url fails
  500. httpsldr.fetch_data(urllib.parse.urlparse('hash://sha256/asldfkj'))
  501. # that when the request fails
  502. uomock.return_value.__enter__.return_value.status = 400
  503. # it raises a RuntimeError
  504. with self.assertRaises(RuntimeError):
  505. httpsldr.fetch_data(hashurl)
  506. def test_overlappingaliases(self):
  507. # make sure that an aliases file is consistent and does not
  508. # override other urls. That is that any hashes are
  509. # consistent, and that they have at least one root hash that
  510. # is the same, and will be used for fetching.
  511. #
  512. # Likely will also have to deal w/ an issue where two
  513. # aliases share sha256, and a third shares sha512, which in
  514. # this case, BOTH hashse have to be checked.
  515. pass
  516. def test_loaderpriority(self):
  517. # XXX - write test to allow you to specify the priority of
  518. # a loader, to ensure that cache stays at top.
  519. # Maybe also think of a way to say local/remote, because
  520. # some loaders may be "more local" than others, like using
  521. # a local ipfs gateway makes more sense than hitting a
  522. # public gateway
  523. pass
  524. def test_filecorruption(self):
  525. cachedir = self.tempdir / 'cachedir'
  526. cachedir.mkdir()
  527. # that an existing file
  528. shutil.copy(self.fixtures / 'hello.py', cachedir)
  529. # is in the cache
  530. fdcas = FileDirCAS(cachedir)
  531. # that when refresh is surpressed
  532. fdcas.refresh_dir = lambda: None
  533. # and has a bogus hash
  534. fdcas._hashes['0000'] = cachedir / 'hello.py'
  535. # that when read raises an exception
  536. with self.assertRaises(ValueError):
  537. fdcas.fetch_data(urllib.parse.urlparse('hash://sha256/0000'))
  538. # that when passed an invalid url
  539. with self.assertRaises(ValueError):
  540. fdcas.fetch_data(urllib.parse.urlparse('https://sha256/0000'))