From 62f507912229a8dcf7cf538eab44ec7d7417a290 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Mon, 22 Aug 2022 17:57:51 -0700 Subject: [PATCH 01/11] add initial version of bencode from BitTornado.. the license file text was imported.. --- bencode.py | 346 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100644 bencode.py diff --git a/bencode.py b/bencode.py new file mode 100644 index 0000000..c5c5d60 --- /dev/null +++ b/bencode.py @@ -0,0 +1,346 @@ +# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman +# see LICENSE.txt for license information + +# LICENSE.txt: +# Unless otherwise noted, all files are released under the MIT +# license, exceptions contain licensing information in them. +# +# Copyright (C) 2001-2002 Bram Cohen +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# The Software is provided "AS IS", without warranty of any kind, +# express or implied, including but not limited to the warranties of +# merchantability, fitness for a particular purpose and +# noninfringement. In no event shall the authors or copyright holders +# be liable for any claim, damages or other liability, whether in an +# action of contract, tort or otherwise, arising from, out of or in +# connection with the Software or the use or other dealings in the +# Software. 
+ + +from types import IntType, LongType, StringType, ListType, TupleType, DictType +try: + from types import BooleanType +except ImportError: + BooleanType = None +try: + from types import UnicodeType +except ImportError: + UnicodeType = None +from cStringIO import StringIO + +def decode_int(x, f): + f += 1 + newf = x.index('e', f) + try: + n = int(x[f:newf]) + except: + n = long(x[f:newf]) + if x[f] == '-': + if x[f + 1] == '0': + raise ValueError + elif x[f] == '0' and newf != f+1: + raise ValueError + return (n, newf+1) + +def decode_string(x, f): + colon = x.index(':', f) + try: + n = int(x[f:colon]) + except (OverflowError, ValueError): + n = long(x[f:colon]) + if x[f] == '0' and colon != f+1: + raise ValueError + colon += 1 + return (x[colon:colon+n], colon+n) + +def decode_unicode(x, f): + s, f = decode_string(x, f+1) + return (s.decode('UTF-8'),f) + +def decode_list(x, f): + r, f = [], f+1 + while x[f] != 'e': + v, f = decode_func[x[f]](x, f) + r.append(v) + return (r, f + 1) + +def decode_dict(x, f): + r, f = {}, f+1 + lastkey = None + while x[f] != 'e': + k, f = decode_string(x, f) + if lastkey >= k: + raise ValueError + lastkey = k + r[k], f = decode_func[x[f]](x, f) + return (r, f + 1) + +decode_func = {} +decode_func['l'] = decode_list +decode_func['d'] = decode_dict +decode_func['i'] = decode_int +decode_func['0'] = decode_string +decode_func['1'] = decode_string +decode_func['2'] = decode_string +decode_func['3'] = decode_string +decode_func['4'] = decode_string +decode_func['5'] = decode_string +decode_func['6'] = decode_string +decode_func['7'] = decode_string +decode_func['8'] = decode_string +decode_func['9'] = decode_string +#decode_func['u'] = decode_unicode + +def bdecode(x, sloppy = 0): + try: + r, l = decode_func[x[0]](x, 0) +# except (IndexError, KeyError): + except (IndexError, KeyError, ValueError): + raise ValueError, "bad bencoded data" + if not sloppy and l != len(x): + raise ValueError, "bad bencoded data" + return r + +def 
test_bdecode(): + try: + bdecode('0:0:') + assert 0 + except ValueError: + pass + try: + bdecode('ie') + assert 0 + except ValueError: + pass + try: + bdecode('i341foo382e') + assert 0 + except ValueError: + pass + assert bdecode('i4e') == 4L + assert bdecode('i0e') == 0L + assert bdecode('i123456789e') == 123456789L + assert bdecode('i-10e') == -10L + try: + bdecode('i-0e') + assert 0 + except ValueError: + pass + try: + bdecode('i123') + assert 0 + except ValueError: + pass + try: + bdecode('') + assert 0 + except ValueError: + pass + try: + bdecode('i6easd') + assert 0 + except ValueError: + pass + try: + bdecode('35208734823ljdahflajhdf') + assert 0 + except ValueError: + pass + try: + bdecode('2:abfdjslhfld') + assert 0 + except ValueError: + pass + assert bdecode('0:') == '' + assert bdecode('3:abc') == 'abc' + assert bdecode('10:1234567890') == '1234567890' + try: + bdecode('02:xy') + assert 0 + except ValueError: + pass + try: + bdecode('l') + assert 0 + except ValueError: + pass + assert bdecode('le') == [] + try: + bdecode('leanfdldjfh') + assert 0 + except ValueError: + pass + assert bdecode('l0:0:0:e') == ['', '', ''] + try: + bdecode('relwjhrlewjh') + assert 0 + except ValueError: + pass + assert bdecode('li1ei2ei3ee') == [1, 2, 3] + assert bdecode('l3:asd2:xye') == ['asd', 'xy'] + assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]] + try: + bdecode('d') + assert 0 + except ValueError: + pass + try: + bdecode('defoobar') + assert 0 + except ValueError: + pass + assert bdecode('de') == {} + assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'} + assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}} + try: + bdecode('d3:fooe') + assert 0 + except ValueError: + pass + try: + bdecode('di1e0:e') + assert 0 + except ValueError: + pass + try: + bdecode('d1:b0:1:a0:e') + assert 0 + except ValueError: + pass + try: + bdecode('d1:a0:1:a0:e') + assert 0 + 
except ValueError: + pass + try: + bdecode('i03e') + assert 0 + except ValueError: + pass + try: + bdecode('l01:ae') + assert 0 + except ValueError: + pass + try: + bdecode('9999:x') + assert 0 + except ValueError: + pass + try: + bdecode('l0:') + assert 0 + except ValueError: + pass + try: + bdecode('d0:0:') + assert 0 + except ValueError: + pass + try: + bdecode('d0:') + assert 0 + except ValueError: + pass + +bencached_marker = [] + +class Bencached: + def __init__(self, s): + self.marker = bencached_marker + self.bencoded = s + +BencachedType = type(Bencached('')) # insufficient, but good as a filter + +def encode_bencached(x,r): + assert x.marker == bencached_marker + r.append(x.bencoded) + +def encode_int(x,r): + r.extend(('i',str(x),'e')) + +def encode_bool(x,r): + encode_int(int(x),r) + +def encode_string(x,r): + r.extend((str(len(x)),':',x)) + +def encode_unicode(x,r): + #r.append('u') + encode_string(x.encode('UTF-8'),r) + +def encode_list(x,r): + r.append('l') + for e in x: + encode_func[type(e)](e, r) + r.append('e') + +def encode_dict(x,r): + r.append('d') + ilist = x.items() + ilist.sort() + for k,v in ilist: + r.extend((str(len(k)),':',k)) + encode_func[type(v)](v, r) + r.append('e') + +encode_func = {} +encode_func[BencachedType] = encode_bencached +encode_func[IntType] = encode_int +encode_func[LongType] = encode_int +encode_func[StringType] = encode_string +encode_func[ListType] = encode_list +encode_func[TupleType] = encode_list +encode_func[DictType] = encode_dict +if BooleanType: + encode_func[BooleanType] = encode_bool +if UnicodeType: + encode_func[UnicodeType] = encode_unicode + +def bencode(x): + r = [] + try: + encode_func[type(x)](x, r) + except: + print "*** error *** could not encode type %s (value: %s)" % (type(x), x) + assert 0 + return ''.join(r) + +def test_bencode(): + assert bencode(4) == 'i4e' + assert bencode(0) == 'i0e' + assert bencode(-10) == 'i-10e' + assert bencode(12345678901234567890L) == 'i12345678901234567890e' + assert 
bencode('') == '0:' + assert bencode('abc') == '3:abc' + assert bencode('1234567890') == '10:1234567890' + assert bencode([]) == 'le' + assert bencode([1, 2, 3]) == 'li1ei2ei3ee' + assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee' + assert bencode({}) == 'de' + assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee' + assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee' + try: + bencode({1: 'foo'}) + assert 0 + except AssertionError: + pass + + +try: + import psyco + psyco.bind(bdecode) + psyco.bind(bencode) +except ImportError: + pass From 20f312a2b1e100bfd90763424cf39f252960324b Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 08:55:39 -0700 Subject: [PATCH 02/11] Make the python3 compliant. This doesn't deal w/ unicode yet --- bencode.py | 224 ++++++++++++++++++++++++++--------------------------- 1 file changed, 109 insertions(+), 115 deletions(-) diff --git a/bencode.py b/bencode.py index c5c5d60..3a64056 100644 --- a/bencode.py +++ b/bencode.py @@ -28,7 +28,6 @@ # Software. 
-from types import IntType, LongType, StringType, ListType, TupleType, DictType try: from types import BooleanType except ImportError: @@ -37,29 +36,23 @@ try: from types import UnicodeType except ImportError: UnicodeType = None -from cStringIO import StringIO +from io import StringIO def decode_int(x, f): f += 1 - newf = x.index('e', f) - try: - n = int(x[f:newf]) - except: - n = long(x[f:newf]) - if x[f] == '-': - if x[f + 1] == '0': + newf = x.index(b'e', f) + n = int(x[f:newf]) + if x[f] == b'-'[0]: + if x[f + 1] == b'0'[0]: raise ValueError - elif x[f] == '0' and newf != f+1: + elif x[f] == b'0'[0] and newf != f+1: raise ValueError return (n, newf+1) - + def decode_string(x, f): - colon = x.index(':', f) - try: - n = int(x[f:colon]) - except (OverflowError, ValueError): - n = long(x[f:colon]) - if x[f] == '0' and colon != f+1: + colon = x.index(b':', f) + n = int(x[f:colon]) + if x[f] == b'0'[0] and colon != f+1: raise ValueError colon += 1 return (x[colon:colon+n], colon+n) @@ -70,15 +63,15 @@ def decode_unicode(x, f): def decode_list(x, f): r, f = [], f+1 - while x[f] != 'e': + while x[f] != b'e'[0]: v, f = decode_func[x[f]](x, f) r.append(v) return (r, f + 1) def decode_dict(x, f): r, f = {}, f+1 - lastkey = None - while x[f] != 'e': + lastkey = b'' + while x[f] != b'e'[0]: k, f = decode_string(x, f) if lastkey >= k: raise ValueError @@ -87,58 +80,63 @@ def decode_dict(x, f): return (r, f + 1) decode_func = {} -decode_func['l'] = decode_list -decode_func['d'] = decode_dict -decode_func['i'] = decode_int -decode_func['0'] = decode_string -decode_func['1'] = decode_string -decode_func['2'] = decode_string -decode_func['3'] = decode_string -decode_func['4'] = decode_string -decode_func['5'] = decode_string -decode_func['6'] = decode_string -decode_func['7'] = decode_string -decode_func['8'] = decode_string -decode_func['9'] = decode_string -#decode_func['u'] = decode_unicode - +decode_func[b'l'[0]] = decode_list +decode_func[b'd'[0]] = decode_dict 
+decode_func[b'i'[0]] = decode_int +decode_func[b'0'[0]] = decode_string +decode_func[b'1'[0]] = decode_string +decode_func[b'2'[0]] = decode_string +decode_func[b'3'[0]] = decode_string +decode_func[b'4'[0]] = decode_string +decode_func[b'5'[0]] = decode_string +decode_func[b'6'[0]] = decode_string +decode_func[b'7'[0]] = decode_string +decode_func[b'8'[0]] = decode_string +decode_func[b'9'[0]] = decode_string +#decode_func['u'[0]] = decode_unicode + def bdecode(x, sloppy = 0): try: r, l = decode_func[x[0]](x, 0) # except (IndexError, KeyError): except (IndexError, KeyError, ValueError): - raise ValueError, "bad bencoded data" + raise ValueError("bad bencoded data") if not sloppy and l != len(x): - raise ValueError, "bad bencoded data" + raise ValueError("bad bencoded data") return r def test_bdecode(): try: - bdecode('0:0:') + bdecode(b'0:0:') + assert 0 + except ValueError: + pass + try: + bdecode(b'ie') assert 0 except ValueError: pass try: - bdecode('ie') + bdecode(b'i341foo382e') assert 0 except ValueError: pass + assert bdecode(b'i4e') == 4 + assert bdecode(b'i0e') == 0 + assert bdecode(b'i123456789e') == 123456789 + assert bdecode(b'i-10e') == -10 try: - bdecode('i341foo382e') + bdecode(b'i-0e') assert 0 except ValueError: pass - assert bdecode('i4e') == 4L - assert bdecode('i0e') == 0L - assert bdecode('i123456789e') == 123456789L - assert bdecode('i-10e') == -10L try: - bdecode('i-0e') + bdecode(b'i123') assert 0 except ValueError: pass try: - bdecode('i123') + bdecode(b'') assert 0 except ValueError: pass @@ -148,108 +146,108 @@ def test_bdecode(): except ValueError: pass try: - bdecode('i6easd') + bdecode(b'i6easd') assert 0 except ValueError: pass try: - bdecode('35208734823ljdahflajhdf') + bdecode(b'35208734823ljdahflajhdf') assert 0 except ValueError: pass try: - bdecode('2:abfdjslhfld') + bdecode(b'2:abfdjslhfld') assert 0 except ValueError: pass - assert bdecode('0:') == '' - assert bdecode('3:abc') == 'abc' - assert bdecode('10:1234567890') == 
'1234567890' + assert bdecode(b'0:') == b'' + assert bdecode(b'3:abc') == b'abc' + assert bdecode(b'10:1234567890') == b'1234567890' try: - bdecode('02:xy') + bdecode(b'02:xy') assert 0 except ValueError: pass try: - bdecode('l') + bdecode(b'l') assert 0 except ValueError: pass - assert bdecode('le') == [] + assert bdecode(b'le') == [] try: - bdecode('leanfdldjfh') + bdecode(b'leanfdldjfh') assert 0 except ValueError: pass - assert bdecode('l0:0:0:e') == ['', '', ''] + assert bdecode(b'l0:0:0:e') == [b'', b'', b''] try: - bdecode('relwjhrlewjh') + bdecode(b'relwjhrlewjh') assert 0 except ValueError: pass - assert bdecode('li1ei2ei3ee') == [1, 2, 3] - assert bdecode('l3:asd2:xye') == ['asd', 'xy'] - assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]] + assert bdecode(b'li1ei2ei3ee') == [1, 2, 3] + assert bdecode(b'l3:asd2:xye') == [b'asd', b'xy'] + assert bdecode(b'll5:Alice3:Bobeli2ei3eee') == [[b'Alice', b'Bob'], [2, 3]] try: - bdecode('d') + bdecode(b'd') assert 0 except ValueError: pass try: - bdecode('defoobar') + bdecode(b'defoobar') assert 0 except ValueError: pass - assert bdecode('de') == {} - assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'} - assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}} + assert bdecode(b'de') == {} + assert bdecode(b'd3:agei25e4:eyes4:bluee') == {b'age': 25, b'eyes': b'blue'} + assert bdecode(b'd8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {b'spam.mp3': {b'author': b'Alice', b'length': 100000}} try: - bdecode('d3:fooe') + bdecode(b'd3:fooe') assert 0 except ValueError: pass try: - bdecode('di1e0:e') + bdecode(b'di1e0:e') assert 0 except ValueError: pass try: - bdecode('d1:b0:1:a0:e') + bdecode(b'd1:b0:1:a0:e') assert 0 except ValueError: pass try: - bdecode('d1:a0:1:a0:e') + bdecode(b'd1:a0:1:a0:e') assert 0 except ValueError: pass try: - bdecode('i03e') + bdecode(b'i03e') assert 0 except ValueError: pass try: - 
bdecode('l01:ae') + bdecode(b'l01:ae') assert 0 except ValueError: pass try: - bdecode('9999:x') + bdecode(b'9999:x') assert 0 except ValueError: pass try: - bdecode('l0:') + bdecode(b'l0:') assert 0 except ValueError: pass try: - bdecode('d0:0:') + bdecode(b'd0:0:') assert 0 except ValueError: pass try: - bdecode('d0:') + bdecode(b'd0:') assert 0 except ValueError: pass @@ -261,83 +259,79 @@ class Bencached: self.marker = bencached_marker self.bencoded = s -BencachedType = type(Bencached('')) # insufficient, but good as a filter +BencachedType = type(Bencached(b'')) # insufficient, but good as a filter def encode_bencached(x,r): assert x.marker == bencached_marker r.append(x.bencoded) def encode_int(x,r): - r.extend(('i',str(x),'e')) + r.append(b'i%de' % x) def encode_bool(x,r): encode_int(int(x),r) -def encode_string(x,r): - r.extend((str(len(x)),':',x)) +def encode_bytes(x,r): + r.extend((b'%d:' % len(x),x)) -def encode_unicode(x,r): +def encode_string(x,r): #r.append('u') - encode_string(x.encode('UTF-8'),r) + encode_bytes(x.encode('UTF-8'),r) def encode_list(x,r): - r.append('l') + r.append(b'l') for e in x: encode_func[type(e)](e, r) - r.append('e') + r.append(b'e') def encode_dict(x,r): - r.append('d') - ilist = x.items() - ilist.sort() - for k,v in ilist: - r.extend((str(len(k)),':',k)) + r.append(b'd') + for k,v in sorted(x.items()): + r.extend((b'%d:' % len(k),k.encode('UTF-8'))) encode_func[type(v)](v, r) - r.append('e') + r.append(b'e') encode_func = {} encode_func[BencachedType] = encode_bencached -encode_func[IntType] = encode_int -encode_func[LongType] = encode_int -encode_func[StringType] = encode_string -encode_func[ListType] = encode_list -encode_func[TupleType] = encode_list -encode_func[DictType] = encode_dict +encode_func[int] = encode_int +encode_func[str] = encode_string +encode_func[list] = encode_list +encode_func[tuple] = encode_list +encode_func[type({})] = encode_dict if BooleanType: encode_func[BooleanType] = encode_bool if UnicodeType: 
encode_func[UnicodeType] = encode_unicode - + def bencode(x): r = [] try: encode_func[type(x)](x, r) except: - print "*** error *** could not encode type %s (value: %s)" % (type(x), x) - assert 0 - return ''.join(r) + raise ValueError("could not encode type %s (value: %s)" % (type(x), x)) + return b''.join(r) def test_bencode(): - assert bencode(4) == 'i4e' - assert bencode(0) == 'i0e' - assert bencode(-10) == 'i-10e' - assert bencode(12345678901234567890L) == 'i12345678901234567890e' - assert bencode('') == '0:' - assert bencode('abc') == '3:abc' - assert bencode('1234567890') == '10:1234567890' - assert bencode([]) == 'le' - assert bencode([1, 2, 3]) == 'li1ei2ei3ee' - assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee' - assert bencode({}) == 'de' - assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee' - assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee' + assert bencode(4) == b'i4e' + assert bencode(0) == b'i0e' + assert bencode(-10) == b'i-10e' + assert bencode(12345678901234567890) == b'i12345678901234567890e' + assert bencode('') == b'0:' + assert bencode('abc') == b'3:abc' + assert bencode('1234567890') == b'10:1234567890' + assert bencode([]) == b'le' + assert bencode([1, 2, 3]) == b'li1ei2ei3ee' + assert bencode([['Alice', 'Bob'], [2, 3]]) == b'll5:Alice3:Bobeli2ei3eee' + assert bencode({}) == b'de' + assert bencode({'age': 25, 'eyes': 'blue'}) == b'd3:agei25e4:eyes4:bluee' + assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == b'd8:spam.mp3d6:author5:Alice6:lengthi100000eee' try: bencode({1: 'foo'}) assert 0 - except AssertionError: + except (ValueError, AssertionError): pass - + try: import psyco psyco.bind(bdecode) From eb26d1d04fcae610f406e1aaa6c14b62804798f2 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 14:05:56 -0700 Subject: [PATCH 03/11] use strings (us-ascii) for dict keys to make things more 
manageable.. --- bencode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bencode.py b/bencode.py index 3a64056..8e05613 100644 --- a/bencode.py +++ b/bencode.py @@ -70,9 +70,10 @@ def decode_list(x, f): def decode_dict(x, f): r, f = {}, f+1 - lastkey = b'' + lastkey = '' while x[f] != b'e'[0]: k, f = decode_string(x, f) + k = k.decode('us-ascii') if lastkey >= k: raise ValueError lastkey = k @@ -199,8 +200,8 @@ def test_bdecode(): except ValueError: pass assert bdecode(b'de') == {} - assert bdecode(b'd3:agei25e4:eyes4:bluee') == {b'age': 25, b'eyes': b'blue'} - assert bdecode(b'd8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {b'spam.mp3': {b'author': b'Alice', b'length': 100000}} + assert bdecode(b'd3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': b'blue'} + assert bdecode(b'd8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': b'Alice', 'length': 100000}} try: bdecode(b'd3:fooe') assert 0 From f2b1a00971fe2147c6a1d125b56e261dbde7dbe7 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 14:06:37 -0700 Subject: [PATCH 04/11] first cut at verification of bittorrent files... This needs to be expanded a bit, and handle utf-8 encoded filenames.. --- __init__.py | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 __init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..8a84135 --- /dev/null +++ b/__init__.py @@ -0,0 +1,148 @@ + +from . 
import bencode +from hashlib import sha1 +import importlib.resources +import itertools +import os +import pathlib +import shutil +import sys +import tempfile +import unittest + +class Storage: + def __init__(self, rootpath, files, piecelen): + self._rootpath = pathlib.Path(rootpath) + self._files = files + self._piecelen = piecelen + + self._buildindex() + + def _buildindex(self): + self._index = [] + files = iter(self._files) + left = 0 + curfile = None + + while True: + if curfile is None or curfileoff == curfile['length']: + # next file + try: + curfile = next(files) + fname = pathlib.Path( + *(x.decode('us-ascii') for x in + curfile['path'])) + curfilepath = self._rootpath / fname + except StopIteration: + break + curfileoff = 0 + + if left == 0: + current = [] + self._index.append(current) + left = self._piecelen + + sz = min(curfile['length'] - curfileoff, left) + + current.append(dict(file=curfilepath, fname=fname, + offset=curfileoff, size=sz)) + + curfileoff += sz + left -= sz + + def apply_piece(self, idx, fun): + for i in self._index[idx]: + with open(i['file'], 'rb') as fp: + fp.seek(i['offset']) + fun(fp.read(i['size'])) + +def validate(torrent, basedir): + info = torrent['info'] + + basedir = pathlib.Path(basedir) + + print(repr(torrent)) + + torrentdir = basedir / info['name'].decode('us-ascii') + + stor = Storage(torrentdir, info['files'], info['piece length']) + + pieces = info['pieces'] + for num, i in enumerate(pieces[x:x+20] for x in range(0, len(pieces), + 20)): + hash = sha1() + + stor.apply_piece(num, hash.update) + + if hash.digest() != i: + raise ValueError + +class _TestCases(unittest.TestCase): + dirname = 'somedir' + origfiledata = { + 'filea.txt': b'foo\n', + 'fileb.txt': b'bar\n', + 'filec.txt': b'bleha\n', + 'filed.txt': b'somehow\n', + 'filee.txt': b'nowab\n', + 'filef/filef.txt': b'\n', + } + + def setUp(self): + d = pathlib.Path(tempfile.mkdtemp()).resolve() + + tor = importlib.resources.files(__name__) + tor = tor / 'fixtures' / 
'somedir.torrent' + with tor.open('rb') as fp: + self.torrent = bencode.bdecode(fp.read()) + + self.basetempdir = d + + self.oldcwd = os.getcwd() + + os.chdir(d) + + def tearDown(self): + shutil.rmtree(self.basetempdir) + + os.chdir(self.oldcwd) + + @staticmethod + def make_files(dname, fdict): + dname = pathlib.Path(dname) + for k, v in fdict.items(): + k = dname / pathlib.PurePosixPath(k) + k.parent.mkdir(parents=True, exist_ok=True) + with open(k, 'wb') as fp: + fp.write(v) + + def test_completeverif(self): + sd = self.basetempdir / self.dirname + sd.mkdir() + + self.make_files(sd, self.origfiledata) + + validate(self.torrent, self.basetempdir) + + def test_verification(self): + # Testing for "missing" files + # piece size 2 (aka 4 bytes) + # empty file of 4 bytes 'foo\n' + # complete file of 4 bytes 'bar\n' + # partial missing file, 6 bytes, last two correct 'bleha\n' + # complete file of 8 bytes (multiple pieces) 'somehow\n' + # partial missing file, starting w/ 2 bytes, length 6 'nowab\n' + # complete file (length 1) '\n' + + missingfiles = self.origfiledata.copy() + + missingfiles['filea.txt'] = b'' + missingfiles['filec.txt'] = b'\x00\x00\x00\x00a\n' + missingfiles['filee.txt'] = b'no' + + sd = self.basetempdir / self.dirname + sd.mkdir() + + self.make_files(sd, missingfiles) + + self.assertRaises(ValueError, validate, self.torrent, self.basetempdir) From c2918b9c8af71e2a217ec075ed7d4f14b33f0ff5 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 14:31:53 -0700 Subject: [PATCH 05/11] support specified encoding in torrent... drop debug print.. 
--- __init__.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/__init__.py b/__init__.py index 8a84135..f2582e8 100644 --- a/__init__.py +++ b/__init__.py @@ -11,10 +11,11 @@ import tempfile import unittest class Storage: - def __init__(self, rootpath, files, piecelen): + def __init__(self, rootpath, files, piecelen, encoding='us-ascii'): self._rootpath = pathlib.Path(rootpath) self._files = files self._piecelen = piecelen + self._encoding = encoding self._buildindex() @@ -30,7 +31,7 @@ class Storage: try: curfile = next(files) fname = pathlib.Path( - *(x.decode('us-ascii') for x in + *(x.decode(self._encoding) for x in curfile['path'])) curfilepath = self._rootpath / fname except StopIteration: @@ -61,11 +62,14 @@ def validate(torrent, basedir): basedir = pathlib.Path(basedir) - print(repr(torrent)) + try: + encoding = torrent['encoding'].decode('us-ascii') + except KeyError: + encoding = 'us-ascii' - torrentdir = basedir / info['name'].decode('us-ascii') + torrentdir = basedir / info['name'].decode(encoding) - stor = Storage(torrentdir, info['files'], info['piece length']) + stor = Storage(torrentdir, info['files'], info['piece length'], encoding) pieces = info['pieces'] for num, i in enumerate(pieces[x:x+20] for x in range(0, len(pieces), @@ -124,6 +128,21 @@ class _TestCases(unittest.TestCase): validate(self.torrent, self.basetempdir) + # encoded names + + sd = self.basetempdir / 'thai' + sd.mkdir() + + self.make_files(sd, { 'thai - สวัสดี.txt': b'hello\n' + }) + + tor = importlib.resources.files(__name__) + tor = tor / 'fixtures' / 'thai.torrent' + with tor.open('rb') as fp: + torrent = bencode.bdecode(fp.read()) + + validate(torrent, self.basetempdir) + def test_verification(self): # Testing for "missing" files # piece size 2 (aka 4 bytes) From a594159abb254cf13fd5abfbc64163064075a4b8 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 14:41:39 -0700 Subject: [PATCH 06/11] add missing 
fixture files.. --- fixtures/somedir.torrent | 4 ++++ fixtures/thai.torrent | Bin 0 -> 240 bytes 2 files changed, 4 insertions(+) create mode 100644 fixtures/somedir.torrent create mode 100644 fixtures/thai.torrent diff --git a/fixtures/somedir.torrent b/fixtures/somedir.torrent new file mode 100644 index 0000000..f1ef3f0 --- /dev/null +++ b/fixtures/somedir.torrent @@ -0,0 +1,4 @@ +d13:creation datei1661282749e4:infod5:filesld6:lengthi4e4:pathl9:filea.txteed6:lengthi4e4:pathl9:fileb.txteed6:lengthi6e4:pathl9:filec.txteed6:lengthi8e4:pathl9:filed.txteed6:lengthi6e4:pathl9:filee.txteed6:lengthi1e4:pathl5:filef9:filef.txteee4:name7:somedir12:piece lengthi4e6:pieces160:$醬l2B;'N-7/Q)vTUBY)-/: +bBP ؖs +G3 +)d3p%*i?g&S2 +_.7[2#e;InF͟2e5:nodeslee \ No newline at end of file diff --git a/fixtures/thai.torrent b/fixtures/thai.torrent new file mode 100644 index 0000000000000000000000000000000000000000..5a409b1c58768b1f43302ab20b1d245a4f595fd5 GIT binary patch literal 240 zcmXX>OKQU~5baU4DI|+HmaHi99H857k^?B#I393h*AgK;MK_^WD0J6Cz^BM27x1>* zLMe1r<+B-vd7pXD@)U{=O+d?5&shqK&N#mz@2PQ9k>rw%tVWe0N?DG1c4bG9*(f5e z6c;+zlA{K6MPtdWXRus6Otb`YWn*VhQjNa03TkxgP?16jtqD~vatM_nHeo~mKJ;%x z|8ZINLw`uZE@1o$&TQg5^SFSPut0Y6$;4d6Y}ca~Dg1P&*S{}Tj-DPrk9WVx{`>xA N4sA! Date: Tue, 23 Aug 2022 16:30:42 -0700 Subject: [PATCH 07/11] change how validate is implemented, return files good/bad.. --- __init__.py | 59 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/__init__.py b/__init__.py index f2582e8..2170be4 100644 --- a/__init__.py +++ b/__init__.py @@ -1,5 +1,6 @@ from . 
import bencode +from functools import reduce from hashlib import sha1 import importlib.resources import itertools @@ -19,9 +20,22 @@ class Storage: self._buildindex() + def _filepaths(self): + for curfile in self._files: + fname = pathlib.Path( + *(x.decode(self._encoding) for x in + curfile['path'])) + curfilepath = self._rootpath / fname + + yield curfile, fname, curfilepath + + def allfiles(self): + for x, y, curfilepath in self._filepaths(): + yield curfilepath + def _buildindex(self): self._index = [] - files = iter(self._files) + files = self._filepaths() left = 0 curfile = None @@ -29,11 +43,7 @@ class Storage: if curfile is None or curfileoff == curfile['length']: # next file try: - curfile = next(files) - fname = pathlib.Path( - *(x.decode(self._encoding) for x in - curfile['path'])) - curfilepath = self._rootpath / fname + curfile, fname, curfilepath = next(files) except StopIteration: break curfileoff = 0 @@ -51,6 +61,10 @@ class Storage: curfileoff += sz left -= sz + def filesforpiece(self, idx): + for x in self._index[idx]: + yield x['file'] + def apply_piece(self, idx, fun): for i in self._index[idx]: with open(i['file'], 'rb') as fp: @@ -72,14 +86,28 @@ def validate(torrent, basedir): stor = Storage(torrentdir, info['files'], info['piece length'], encoding) pieces = info['pieces'] + piecescnt = len(pieces) // 20 + valid = [ None ] * piecescnt for num, i in enumerate(pieces[x:x+20] for x in range(0, len(pieces), 20)): hash = sha1() stor.apply_piece(num, hash.update) - if hash.digest() != i: - raise ValueError + if hash.digest() == i: + valid[num] = True + else: + valid[num] = False + + # if any piece of a file is bad, it's bad + allfiles = set(stor.allfiles()) + + badpieces = [ x for x, v in enumerate(valid) if not v ] + + badfiles = reduce(set.__or__, (set(stor.filesforpiece(x)) for x in + badpieces), set()) + + return allfiles - badfiles, badfiles class _TestCases(unittest.TestCase): dirname = 'somedir' @@ -155,13 +183,20 @@ class 
_TestCases(unittest.TestCase): missingfiles = self.origfiledata.copy() - missingfiles['filea.txt'] = b'' - missingfiles['filec.txt'] = b'\x00\x00\x00\x00a\n' - missingfiles['filee.txt'] = b'no' + badfiles = { + 'filea.txt': b'', + 'filec.txt': b'\x00\x00\x00\x00a\n', + 'filee.txt': b'no', + } + + missingfiles.update(badfiles) sd = self.basetempdir / self.dirname sd.mkdir() self.make_files(sd, missingfiles) - self.assertRaises(ValueError, validate, self.torrent, self.basetempdir) + val, inval = validate(self.torrent, self.basetempdir) + + self.assertEqual(set(val), { sd / x for x in missingfiles.keys() if x not in badfiles }) + self.assertEqual(set(inval), { sd / x for x in badfiles.keys() }) From 0865595d3ae811c201cd0cecc523c3a5f1eedff0 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 16:51:36 -0700 Subject: [PATCH 08/11] covert to iterating via files instead of pieces.. This uses an index to quickly look up what pieces are part of a file, and then checks that they are all valid, this should be faster as it is likely that the torrent has more pieces than files (few large files, vs many, many small files).. --- __init__.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/__init__.py b/__init__.py index 2170be4..e646c0d 100644 --- a/__init__.py +++ b/__init__.py @@ -21,8 +21,16 @@ class Storage: self._buildindex() def _filepaths(self): + '''Iterates over all the files in the torrent. 
+ + Each item is a tuple of: + array of file path components (undecoded) + a pathlib.PurePath for the file + a pathlib.Path for file on disk + ''' + for curfile in self._files: - fname = pathlib.Path( + fname = pathlib.PurePath( *(x.decode(self._encoding) for x in curfile['path'])) curfilepath = self._rootpath / fname @@ -34,7 +42,8 @@ class Storage: yield curfilepath def _buildindex(self): - self._index = [] + self._pieceindex = [] + self._fileindex = {} files = self._filepaths() left = 0 curfile = None @@ -50,7 +59,9 @@ class Storage: if left == 0: current = [] - self._index.append(current) + self._fileindex.setdefault(fname, + []).append(len(self._pieceindex)) + self._pieceindex.append(current) left = self._piecelen sz = min(curfile['length'] - curfileoff, left) @@ -61,12 +72,15 @@ class Storage: curfileoff += sz left -= sz + def filepieces(self): + return self._fileindex.items() + def filesforpiece(self, idx): - for x in self._index[idx]: + for x in self._pieceindex[idx]: yield x['file'] def apply_piece(self, idx, fun): - for i in self._index[idx]: + for i in self._pieceindex[idx]: with open(i['file'], 'rb') as fp: fp.seek(i['offset']) fun(fp.read(i['size'])) @@ -83,7 +97,8 @@ def validate(torrent, basedir): torrentdir = basedir / info['name'].decode(encoding) - stor = Storage(torrentdir, info['files'], info['piece length'], encoding) + stor = Storage(torrentdir, info['files'], info['piece length'], + encoding) pieces = info['pieces'] piecescnt = len(pieces) // 20 @@ -102,10 +117,8 @@ def validate(torrent, basedir): # if any piece of a file is bad, it's bad allfiles = set(stor.allfiles()) - badpieces = [ x for x, v in enumerate(valid) if not v ] - - badfiles = reduce(set.__or__, (set(stor.filesforpiece(x)) for x in - badpieces), set()) + badfiles = { torrentdir / x for x, y in stor.filepieces() if + not all(valid[i] for i in y) } return allfiles - badfiles, badfiles @@ -198,5 +211,7 @@ class _TestCases(unittest.TestCase): val, inval = validate(self.torrent, 
self.basetempdir) - self.assertEqual(set(val), { sd / x for x in missingfiles.keys() if x not in badfiles }) - self.assertEqual(set(inval), { sd / x for x in badfiles.keys() }) + self.assertEqual(set(val), { sd / x for x in + missingfiles.keys() if x not in badfiles }) + self.assertEqual(set(inval), { sd / x for x in + badfiles.keys() }) From 7f94e49bc0cdde49d7a3d278bd839849be68f1ea Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Wed, 24 Aug 2022 16:02:14 -0700 Subject: [PATCH 09/11] move badfiles to class, encoding is ONLY UTF-8 per BEP-3 https://www.bittorrent.org/beps/bep_0003.html --- __init__.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/__init__.py b/__init__.py index e646c0d..9fb922e 100644 --- a/__init__.py +++ b/__init__.py @@ -11,12 +11,14 @@ import sys import tempfile import unittest +_encoding = 'utf-8' + class Storage: - def __init__(self, rootpath, files, piecelen, encoding='us-ascii'): + + def __init__(self, rootpath, files, piecelen): self._rootpath = pathlib.Path(rootpath) self._files = files self._piecelen = piecelen - self._encoding = encoding self._buildindex() @@ -31,7 +33,7 @@ class Storage: for curfile in self._files: fname = pathlib.PurePath( - *(x.decode(self._encoding) for x in + *(x.decode(_encoding) for x in curfile['path'])) curfilepath = self._rootpath / fname @@ -90,15 +92,9 @@ def validate(torrent, basedir): basedir = pathlib.Path(basedir) - try: - encoding = torrent['encoding'].decode('us-ascii') - except KeyError: - encoding = 'us-ascii' + torrentdir = basedir / info['name'].decode(_encoding) - torrentdir = basedir / info['name'].decode(encoding) - - stor = Storage(torrentdir, info['files'], info['piece length'], - encoding) + stor = Storage(torrentdir, info['files'], info['piece length']) pieces = info['pieces'] piecescnt = len(pieces) // 20 @@ -124,6 +120,8 @@ def validate(torrent, basedir): class _TestCases(unittest.TestCase): dirname = 'somedir' + + # file 
contents for somedir.torrent origfiledata = { 'filea.txt': b'foo\n', 'fileb.txt': b'bar\n', @@ -133,6 +131,13 @@ class _TestCases(unittest.TestCase): 'filef/filef.txt': b'\n', } + # some munging to make some files bad + badfiles = { + 'filea.txt': b'', + 'filec.txt': b'\x00\x00\x00\x00a\n', + 'filee.txt': b'no', + } + def setUp(self): d = pathlib.Path(tempfile.mkdtemp()).resolve() @@ -196,13 +201,7 @@ class _TestCases(unittest.TestCase): missingfiles = self.origfiledata.copy() - badfiles = { - 'filea.txt': b'', - 'filec.txt': b'\x00\x00\x00\x00a\n', - 'filee.txt': b'no', - } - - missingfiles.update(badfiles) + missingfiles.update(self.badfiles) sd = self.basetempdir / self.dirname sd.mkdir() @@ -212,6 +211,6 @@ class _TestCases(unittest.TestCase): val, inval = validate(self.torrent, self.basetempdir) self.assertEqual(set(val), { sd / x for x in - missingfiles.keys() if x not in badfiles }) + missingfiles.keys() if x not in self.badfiles }) self.assertEqual(set(inval), { sd / x for x in - badfiles.keys() }) + self.badfiles.keys() }) From 893a9b7d9e557f734f8032d47c423d7b9a4a80af Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Thu, 25 Aug 2022 12:45:01 -0700 Subject: [PATCH 10/11] update comment.. --- __init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/__init__.py b/__init__.py index 9fb922e..b02b93f 100644 --- a/__init__.py +++ b/__init__.py @@ -174,7 +174,7 @@ class _TestCases(unittest.TestCase): validate(self.torrent, self.basetempdir) - # encoded names + # that utf-8 encoded names work sd = self.basetempdir / 'thai' sd.mkdir() From 85f5c9a03e3773b6868ae83a075c9df8fc70c4d4 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Thu, 25 Aug 2022 12:55:00 -0700 Subject: [PATCH 11/11] add doc strings, and limit what functions are exposed.. 
--- __init__.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/__init__.py b/__init__.py index b02b93f..02176d6 100644 --- a/__init__.py +++ b/__init__.py @@ -13,9 +13,19 @@ import unittest _encoding = 'utf-8' +__all__ = [ 'validate' ] + class Storage: + '''A class to help read pieces of a torrent. + ''' def __init__(self, rootpath, files, piecelen): + ''' + rootpath - path to the dir the torrent files are in + files - the files dictionary from the torrent info key + piecelen - piece length from the torrent info key + ''' + self._rootpath = pathlib.Path(rootpath) self._files = files self._piecelen = piecelen @@ -40,10 +50,16 @@ class Storage: yield curfile, fname, curfilepath def allfiles(self): + '''Iterator that returns each on disk path name for + each file.''' + for x, y, curfilepath in self._filepaths(): yield curfilepath def _buildindex(self): + '''Internal function to build the needed indexes for + pieces and files.''' + self._pieceindex = [] self._fileindex = {} files = self._filepaths() @@ -75,19 +91,37 @@ class Storage: left -= sz def filepieces(self): + '''Iterator that returns a pair, first item is the subpath + to a file (that is relative to the torrent dir), and the + pieces that cover the file.''' + return self._fileindex.items() def filesforpiece(self, idx): + '''Return a list of files that are covered by piece idx.''' + for x in self._pieceindex[idx]: yield x['file'] def apply_piece(self, idx, fun): + '''Read the parts of piece idx, and call fun w/ each part. + + This is to hash the parts, e.g. + hash = sha1() + stor.apply_piece(num, hash.update) + + hash now contains the digest for the piece.''' + for i in self._pieceindex[idx]: with open(i['file'], 'rb') as fp: fp.seek(i['offset']) fun(fp.read(i['size'])) def validate(torrent, basedir): + '''Take a decoded torrent file, where it was stored in basedir, +
Returns a pair of set, the first is all the + files that are valid, the second are all the invalid files.''' + info = torrent['info'] basedir = pathlib.Path(basedir)