You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
351 lines
13 KiB
351 lines
13 KiB
3 years ago
|
#!/usr/bin/env python3
|
||
|
#
|
||
|
# linearize-data.py: Construct a linear, no-fork version of the chain.
|
||
|
#
|
||
|
# Copyright (c) 2013-2020 The Bitcoin Core developers
|
||
|
# Distributed under the MIT software license, see the accompanying
|
||
|
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||
|
#
|
||
|
|
||
|
import struct
|
||
|
import re
|
||
|
import os
|
||
|
import os.path
|
||
|
import sys
|
||
|
import hashlib
|
||
|
import datetime
|
||
|
import time
|
||
|
import glob
|
||
|
from collections import namedtuple
|
||
|
from binascii import unhexlify
|
||
|
|
||
|
settings = {}
|
||
|
|
||
|
def hex_switchEndian(s):
|
||
|
""" Switches the endianness of a hex string (in pairs of hex chars) """
|
||
|
pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)]
|
||
|
return b''.join(pairList[::-1]).decode()
|
||
|
|
||
|
def uint32(x):
|
||
|
return x & 0xffffffff
|
||
|
|
||
|
def bytereverse(x):
|
||
|
return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
|
||
|
(((x) >> 8) & 0x0000ff00) | ((x) >> 24) ))
|
||
|
|
||
|
def bufreverse(in_buf):
|
||
|
out_words = []
|
||
|
for i in range(0, len(in_buf), 4):
|
||
|
word = struct.unpack('@I', in_buf[i:i+4])[0]
|
||
|
out_words.append(struct.pack('@I', bytereverse(word)))
|
||
|
return b''.join(out_words)
|
||
|
|
||
|
def wordreverse(in_buf):
|
||
|
out_words = []
|
||
|
for i in range(0, len(in_buf), 4):
|
||
|
out_words.append(in_buf[i:i+4])
|
||
|
out_words.reverse()
|
||
|
return b''.join(out_words)
|
||
|
|
||
|
def calc_hdr_hash(blk_hdr):
|
||
|
hash1 = hashlib.sha256()
|
||
|
hash1.update(blk_hdr)
|
||
|
hash1_o = hash1.digest()
|
||
|
|
||
|
hash2 = hashlib.sha256()
|
||
|
hash2.update(hash1_o)
|
||
|
hash2_o = hash2.digest()
|
||
|
|
||
|
return hash2_o
|
||
|
|
||
|
def calc_hash_str(blk_hdr):
|
||
|
hash = calc_hdr_hash(blk_hdr)
|
||
|
hash = bufreverse(hash)
|
||
|
hash = wordreverse(hash)
|
||
|
hash_str = hash.hex()
|
||
|
return hash_str
|
||
|
|
||
|
def get_blk_dt(blk_hdr):
|
||
|
members = struct.unpack("<I", blk_hdr[68:68+4])
|
||
|
nTime = members[0]
|
||
|
dt = datetime.datetime.fromtimestamp(nTime)
|
||
|
dt_ym = datetime.datetime(dt.year, dt.month, 1)
|
||
|
return (dt_ym, nTime)
|
||
|
|
||
|
# When getting the list of block hashes, undo any byte reversals.
|
||
|
def get_block_hashes(settings):
|
||
|
blkindex = []
|
||
|
f = open(settings['hashlist'], "r", encoding="utf8")
|
||
|
for line in f:
|
||
|
line = line.rstrip()
|
||
|
if settings['rev_hash_bytes'] == 'true':
|
||
|
line = hex_switchEndian(line)
|
||
|
blkindex.append(line)
|
||
|
|
||
|
print("Read " + str(len(blkindex)) + " hashes")
|
||
|
|
||
|
return blkindex
|
||
|
|
||
|
# The block map shouldn't give or receive byte-reversed hashes.
|
||
|
def mkblockmap(blkindex):
|
||
|
blkmap = {}
|
||
|
for height,hash in enumerate(blkindex):
|
||
|
blkmap[hash] = height
|
||
|
return blkmap
|
||
|
|
||
|
# This gets the first block file ID that exists from the input block
|
||
|
# file directory.
|
||
|
def getFirstBlockFileId(block_dir_path):
|
||
|
# First, this sets up a pattern to search for block files, for
|
||
|
# example 'blkNNNNN.dat'.
|
||
|
blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")
|
||
|
|
||
|
# This search is done with glob
|
||
|
blkFnList = glob.glob(blkFilePattern)
|
||
|
|
||
|
if len(blkFnList) == 0:
|
||
|
print("blocks not pruned - starting at 0")
|
||
|
return 0
|
||
|
# We then get the lexicographic minimum, which should be the first
|
||
|
# block file name.
|
||
|
firstBlkFilePath = min(blkFnList)
|
||
|
firstBlkFn = os.path.basename(firstBlkFilePath)
|
||
|
|
||
|
# now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
|
||
|
# So get the ID by choosing: 3 4 5 6 7
|
||
|
# The ID is not necessarily 0 if this is a pruned node.
|
||
|
blkId = int(firstBlkFn[3:8])
|
||
|
return blkId
|
||
|
|
||
|
# Block header and extent on disk
|
||
|
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
|
||
|
|
||
|
class BlockDataCopier:
|
||
|
def __init__(self, settings, blkindex, blkmap):
|
||
|
self.settings = settings
|
||
|
self.blkindex = blkindex
|
||
|
self.blkmap = blkmap
|
||
|
|
||
|
# Get first occurring block file id - for pruned nodes this
|
||
|
# will not necessarily be 0
|
||
|
self.inFn = getFirstBlockFileId(self.settings['input'])
|
||
|
self.inF = None
|
||
|
self.outFn = 0
|
||
|
self.outsz = 0
|
||
|
self.outF = None
|
||
|
self.outFname = None
|
||
|
self.blkCountIn = 0
|
||
|
self.blkCountOut = 0
|
||
|
|
||
|
self.lastDate = datetime.datetime(2000, 1, 1)
|
||
|
self.highTS = 1408893517 - 315360000
|
||
|
self.timestampSplit = False
|
||
|
self.fileOutput = True
|
||
|
self.setFileTime = False
|
||
|
self.maxOutSz = settings['max_out_sz']
|
||
|
if 'output' in settings:
|
||
|
self.fileOutput = False
|
||
|
if settings['file_timestamp'] != 0:
|
||
|
self.setFileTime = True
|
||
|
if settings['split_timestamp'] != 0:
|
||
|
self.timestampSplit = True
|
||
|
# Extents and cache for out-of-order blocks
|
||
|
self.blockExtents = {}
|
||
|
self.outOfOrderData = {}
|
||
|
self.outOfOrderSize = 0 # running total size for items in outOfOrderData
|
||
|
|
||
|
def writeBlock(self, inhdr, blk_hdr, rawblock):
|
||
|
blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
|
||
|
if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
|
||
|
self.outF.close()
|
||
|
if self.setFileTime:
|
||
|
os.utime(self.outFname, (int(time.time()), self.highTS))
|
||
|
self.outF = None
|
||
|
self.outFname = None
|
||
|
self.outFn = self.outFn + 1
|
||
|
self.outsz = 0
|
||
|
|
||
|
(blkDate, blkTS) = get_blk_dt(blk_hdr)
|
||
|
if self.timestampSplit and (blkDate > self.lastDate):
|
||
|
print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
|
||
|
self.lastDate = blkDate
|
||
|
if self.outF:
|
||
|
self.outF.close()
|
||
|
if self.setFileTime:
|
||
|
os.utime(self.outFname, (int(time.time()), self.highTS))
|
||
|
self.outF = None
|
||
|
self.outFname = None
|
||
|
self.outFn = self.outFn + 1
|
||
|
self.outsz = 0
|
||
|
|
||
|
if not self.outF:
|
||
|
if self.fileOutput:
|
||
|
self.outFname = self.settings['output_file']
|
||
|
else:
|
||
|
self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
|
||
|
print("Output file " + self.outFname)
|
||
|
self.outF = open(self.outFname, "wb")
|
||
|
|
||
|
self.outF.write(inhdr)
|
||
|
self.outF.write(blk_hdr)
|
||
|
self.outF.write(rawblock)
|
||
|
self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
|
||
|
|
||
|
self.blkCountOut = self.blkCountOut + 1
|
||
|
if blkTS > self.highTS:
|
||
|
self.highTS = blkTS
|
||
|
|
||
|
if (self.blkCountOut % 1000) == 0:
|
||
|
print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
|
||
|
(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
|
||
|
|
||
|
def inFileName(self, fn):
|
||
|
return os.path.join(self.settings['input'], "blk%05d.dat" % fn)
|
||
|
|
||
|
def fetchBlock(self, extent):
|
||
|
'''Fetch block contents from disk given extents'''
|
||
|
with open(self.inFileName(extent.fn), "rb") as f:
|
||
|
f.seek(extent.offset)
|
||
|
return f.read(extent.size)
|
||
|
|
||
|
def copyOneBlock(self):
|
||
|
'''Find the next block to be written in the input, and copy it to the output.'''
|
||
|
extent = self.blockExtents.pop(self.blkCountOut)
|
||
|
if self.blkCountOut in self.outOfOrderData:
|
||
|
# If the data is cached, use it from memory and remove from the cache
|
||
|
rawblock = self.outOfOrderData.pop(self.blkCountOut)
|
||
|
self.outOfOrderSize -= len(rawblock)
|
||
|
else: # Otherwise look up data on disk
|
||
|
rawblock = self.fetchBlock(extent)
|
||
|
|
||
|
self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
|
||
|
|
||
|
def run(self):
|
||
|
while self.blkCountOut < len(self.blkindex):
|
||
|
if not self.inF:
|
||
|
fname = self.inFileName(self.inFn)
|
||
|
print("Input file " + fname)
|
||
|
try:
|
||
|
self.inF = open(fname, "rb")
|
||
|
except IOError:
|
||
|
print("Premature end of block data")
|
||
|
return
|
||
|
|
||
|
inhdr = self.inF.read(8)
|
||
|
if (not inhdr or (inhdr[0] == "\0")):
|
||
|
self.inF.close()
|
||
|
self.inF = None
|
||
|
self.inFn = self.inFn + 1
|
||
|
continue
|
||
|
|
||
|
inMagic = inhdr[:4]
|
||
|
if (inMagic != self.settings['netmagic']):
|
||
|
# Seek backwards 7 bytes (skipping the first byte in the previous search)
|
||
|
# and continue searching from the new position if the magic bytes are not
|
||
|
# found.
|
||
|
self.inF.seek(-7, os.SEEK_CUR)
|
||
|
continue
|
||
|
inLenLE = inhdr[4:]
|
||
|
su = struct.unpack("<I", inLenLE)
|
||
|
inLen = su[0] - 80 # length without header
|
||
|
blk_hdr = self.inF.read(80)
|
||
|
inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
|
||
|
|
||
|
self.hash_str = calc_hash_str(blk_hdr)
|
||
|
if not self.hash_str in blkmap:
|
||
|
# Because blocks can be written to files out-of-order as of 0.10, the script
|
||
|
# may encounter blocks it doesn't know about. Treat as debug output.
|
||
|
if settings['debug_output'] == 'true':
|
||
|
print("Skipping unknown block " + self.hash_str)
|
||
|
self.inF.seek(inLen, os.SEEK_CUR)
|
||
|
continue
|
||
|
|
||
|
blkHeight = self.blkmap[self.hash_str]
|
||
|
self.blkCountIn += 1
|
||
|
|
||
|
if self.blkCountOut == blkHeight:
|
||
|
# If in-order block, just copy
|
||
|
rawblock = self.inF.read(inLen)
|
||
|
self.writeBlock(inhdr, blk_hdr, rawblock)
|
||
|
|
||
|
# See if we can catch up to prior out-of-order blocks
|
||
|
while self.blkCountOut in self.blockExtents:
|
||
|
self.copyOneBlock()
|
||
|
|
||
|
else: # If out-of-order, skip over block data for now
|
||
|
self.blockExtents[blkHeight] = inExtent
|
||
|
if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
|
||
|
# If there is space in the cache, read the data
|
||
|
# Reading the data in file sequence instead of seeking and fetching it later is preferred,
|
||
|
# but we don't want to fill up memory
|
||
|
self.outOfOrderData[blkHeight] = self.inF.read(inLen)
|
||
|
self.outOfOrderSize += inLen
|
||
|
else: # If no space in cache, seek forward
|
||
|
self.inF.seek(inLen, os.SEEK_CUR)
|
||
|
|
||
|
print("Done (%i blocks written)" % (self.blkCountOut))
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
if len(sys.argv) != 2:
|
||
|
print("Usage: linearize-data.py CONFIG-FILE")
|
||
|
sys.exit(1)
|
||
|
|
||
|
f = open(sys.argv[1], encoding="utf8")
|
||
|
for line in f:
|
||
|
# skip comment lines
|
||
|
m = re.search(r'^\s*#', line)
|
||
|
if m:
|
||
|
continue
|
||
|
|
||
|
# parse key=value lines
|
||
|
m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
|
||
|
if m is None:
|
||
|
continue
|
||
|
settings[m.group(1)] = m.group(2)
|
||
|
f.close()
|
||
|
|
||
|
# Force hash byte format setting to be lowercase to make comparisons easier.
|
||
|
# Also place upfront in case any settings need to know about it.
|
||
|
if 'rev_hash_bytes' not in settings:
|
||
|
settings['rev_hash_bytes'] = 'false'
|
||
|
settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()
|
||
|
|
||
|
if 'netmagic' not in settings:
|
||
|
settings['netmagic'] = 'f9beb4d9'
|
||
|
if 'genesis' not in settings:
|
||
|
settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
|
||
|
if 'input' not in settings:
|
||
|
settings['input'] = 'input'
|
||
|
if 'hashlist' not in settings:
|
||
|
settings['hashlist'] = 'hashlist.txt'
|
||
|
if 'file_timestamp' not in settings:
|
||
|
settings['file_timestamp'] = 0
|
||
|
if 'split_timestamp' not in settings:
|
||
|
settings['split_timestamp'] = 0
|
||
|
if 'max_out_sz' not in settings:
|
||
|
settings['max_out_sz'] = 1000 * 1000 * 1000
|
||
|
if 'out_of_order_cache_sz' not in settings:
|
||
|
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
|
||
|
if 'debug_output' not in settings:
|
||
|
settings['debug_output'] = 'false'
|
||
|
|
||
|
settings['max_out_sz'] = int(settings['max_out_sz'])
|
||
|
settings['split_timestamp'] = int(settings['split_timestamp'])
|
||
|
settings['file_timestamp'] = int(settings['file_timestamp'])
|
||
|
settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
|
||
|
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
|
||
|
settings['debug_output'] = settings['debug_output'].lower()
|
||
|
|
||
|
if 'output_file' not in settings and 'output' not in settings:
|
||
|
print("Missing output file / directory")
|
||
|
sys.exit(1)
|
||
|
|
||
|
blkindex = get_block_hashes(settings)
|
||
|
blkmap = mkblockmap(blkindex)
|
||
|
|
||
|
# Block hash map won't be byte-reversed. Neither should the genesis hash.
|
||
|
if not settings['genesis'] in blkmap:
|
||
|
print("Genesis block not found in hashlist")
|
||
|
else:
|
||
|
BlockDataCopier(settings, blkindex, blkmap).run()
|