// Copyright (c) 2015-2016 The btcsuite developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
// This file contains the implementation functions for reading, writing, and
// otherwise working with the flat files that house the actual blocks.
20 "github.com/btcsuite/btcd/chaincfg/chainhash"
21 "github.com/btcsuite/btcd/database"
22 "github.com/btcsuite/btcd/wire"
const (
	// The Bitcoin protocol encodes block height as int32, so the max number
	// of blocks is 2^31. The max block size per the protocol is 32MiB, so
	// the theoretical max at the time this comment was written is 64PiB
	// (pebibytes). With files @ 512MiB each, this would require a maximum
	// of 134,217,728 files. Thus, choose 9 digits of precision for the
	// filenames. An additional benefit is that 9 digits provides 10^9 files
	// @ 512MiB each for a total of ~476.84PiB (roughly 7.4 times the current
	// theoretical max), so there is room for the max block size to grow in
	// the future.
	blockFilenameTemplate = "%09d.fdb"
	// maxOpenFiles is the max number of open files to maintain in the
	// open blocks cache. Note that this does not include the current
	// write file, so there will typically be one more than this value open.
	// maxBlockFileSize is the maximum size for each file used to store
	// blocks.
	//
	// NOTE: The current code uses uint32 for all offsets, so this value
	// must be less than 2^32 (4 GiB). This is also why it's a typed
	// constant.
	maxBlockFileSize uint32 = 512 * 1024 * 1024 // 512 MiB
	// blockLocSize is the number of bytes of serialized block location
	// data that is stored in the block index.
	//
	// The serialized block location format is:
	//
	//   [0:4]  Block file (4 bytes)
	//   [4:8]  File offset (4 bytes)
	//   [8:12] Block length (4 bytes)
	blockLocSize = 12
)
var (
	// castagnoli houses the Castagnoli polynomial used for CRC-32 checksums.
	castagnoli = crc32.MakeTable(crc32.Castagnoli)
)
// filer is an interface which acts very similarly to a *os.File and is
// typically implemented by it. It exists so the test code can provide mock
// files for properly testing corruption and file system issues.
type filer interface {
	io.Closer
	io.WriterAt
	io.ReaderAt
	Truncate(size int64) error
	Sync() error
}
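// As an illustrative sketch (not the package's actual test mock), a test
// double only needs to satisfy this interface; for example, a wrapper that
// simulates write failures might look like:
//
//	type errFiler struct{ filer }
//
//	func (f errFiler) WriteAt(p []byte, off int64) (int, error) {
//		return 0, io.ErrShortWrite // simulate a failed write
//	}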
// lockableFile represents a block file on disk that has been opened for either
// read or read/write access. It also contains a read-write mutex to support
// multiple concurrent readers.
type lockableFile struct {
	sync.RWMutex

	file filer
}
// writeCursor represents the current file and offset of the block file on disk
// for performing all writes. It also contains a read-write mutex to support
// multiple concurrent readers which can reuse the file handle.
type writeCursor struct {
	sync.RWMutex

	// curFile is the current block file that will be appended to when
	// writing new blocks.
	curFile *lockableFile

	// curFileNum is the current block file number and is used to allow
	// readers to use the same open file handle.
	curFileNum uint32

	// curOffset is the offset in the current write block file where the
	// next new block will be written.
	curOffset uint32
}
// blockStore houses information used to handle reading and writing blocks (and
// parts of blocks) into flat files with support for multiple concurrent
// readers.
type blockStore struct {
	// network is the specific network to use in the flat files for each
	// block.
	network wire.BitcoinNet

	// basePath is the base path used for the flat block files and metadata.
	basePath string

	// maxBlockFileSize is the maximum size for each file used to store
	// blocks. It is defined on the store so the whitebox tests can
	// override the value.
	maxBlockFileSize uint32

	// The following fields are related to the flat files which hold the
	// actual blocks. The number of open files is limited by maxOpenFiles.
	//
	// obfMutex protects concurrent access to the openBlockFiles map. It is
	// a RWMutex so multiple readers can simultaneously access open files.
	//
	// openBlockFiles houses the open file handles for existing block files
	// which have been opened read-only along with an individual RWMutex.
	// This scheme allows multiple concurrent readers to the same file while
	// preventing the file from being closed out from under them.
	//
	// lruMutex protects concurrent access to the least recently used list
	// and lookup map.
	//
	// openBlocksLRU tracks how the open files are referenced by pushing the
	// most recently used files to the front of the list, thereby trickling
	// the least recently used files to the end of the list. When a file
	// needs to be closed due to exceeding the max number of allowed open
	// files, the one at the end of the list is closed.
	//
	// fileNumToLRUElem is a mapping between a specific block file number
	// and the associated list element on the least recently used list.
	//
	// Thus, with the combination of these fields, the database supports
	// concurrent non-blocking reads across multiple and individual files
	// along with intelligently limiting the number of open file handles by
	// closing the least recently used files as needed.
	//
	// NOTE: The locking order used throughout is well-defined and MUST be
	// followed. Failure to do so could lead to deadlocks. In particular,
	// the locking order is as follows:
	//   1) obfMutex
	//   2) lruMutex
	//   3) writeCursor mutex
	//   4) specific file mutexes
	//
	// None of the mutexes are required to be locked at the same time, and
	// often aren't. However, if they are to be locked simultaneously, they
	// MUST be locked in the order previously specified.
	//
	// Due to the high performance and multi-read concurrency requirements,
	// write locks should only be held for the minimum time necessary.
	obfMutex         sync.RWMutex
	lruMutex         sync.Mutex
	openBlocksLRU    *list.List // Contains uint32 block file numbers.
	fileNumToLRUElem map[uint32]*list.Element
	openBlockFiles   map[uint32]*lockableFile
	// writeCursor houses the state for the current file and location that
	// new blocks are written to.
	writeCursor *writeCursor

	// These functions are set to openFile, openWriteFile, and deleteFile by
	// default, but are exposed here to allow the whitebox tests to replace
	// them when working with mock files.
	openFileFunc      func(fileNum uint32) (*lockableFile, error)
	openWriteFileFunc func(fileNum uint32) (filer, error)
	deleteFileFunc    func(fileNum uint32) error
}
// blockLocation identifies a particular block file and location.
type blockLocation struct {
	blockFileNum uint32
	fileOffset   uint32
	blockLen     uint32
}
// deserializeBlockLoc deserializes the passed serialized block location
// information. This is data stored into the block index metadata for each
// block. The serialized data passed to this function MUST be at least
// blockLocSize bytes or it will panic. The error check is avoided here
// because this information will always be coming from the block index which
// includes a checksum to detect corruption. Thus it is safe to use this
// unchecked here.
func deserializeBlockLoc(serializedLoc []byte) blockLocation {
	// The serialized block location format is:
	//
	//   [0:4]  Block file (4 bytes)
	//   [4:8]  File offset (4 bytes)
	//   [8:12] Block length (4 bytes)
	return blockLocation{
		blockFileNum: byteOrder.Uint32(serializedLoc[0:4]),
		fileOffset:   byteOrder.Uint32(serializedLoc[4:8]),
		blockLen:     byteOrder.Uint32(serializedLoc[8:12]),
	}
}
// serializeBlockLoc returns the serialization of the passed block location.
// This is data to be stored into the block index metadata for each block.
func serializeBlockLoc(loc blockLocation) []byte {
	// The serialized block location format is:
	//
	//   [0:4]  Block file (4 bytes)
	//   [4:8]  File offset (4 bytes)
	//   [8:12] Block length (4 bytes)
	var serializedData [12]byte
	byteOrder.PutUint32(serializedData[0:4], loc.blockFileNum)
	byteOrder.PutUint32(serializedData[4:8], loc.fileOffset)
	byteOrder.PutUint32(serializedData[8:12], loc.blockLen)
	return serializedData[:]
}
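// As an illustrative sketch, and assuming byteOrder is the little-endian byte
// order used elsewhere in this package, a location such as
// {blockFileNum: 2, fileOffset: 512, blockLen: 285} serializes to the 12 bytes
// 02000000 00020000 1d010000, and deserializeBlockLoc recovers the same struct
// from those bytes.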
// blockFilePath returns the file path for the provided block file number.
func blockFilePath(dbPath string, fileNum uint32) string {
	fileName := fmt.Sprintf(blockFilenameTemplate, fileNum)
	return filepath.Join(dbPath, fileName)
}
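// For example, with the "%09d.fdb" template, block file number 0 lives at
// <dbPath>/000000000.fdb and block file number 123 at <dbPath>/000000123.fdb.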
// openWriteFile returns a file handle for the passed flat file number in
// read/write mode. The file will be created if needed. It is typically used
// for the current file that will have all new data appended. Unlike openFile,
// this function does not keep track of the open file and it is not subject to
// the maxOpenFiles limit.
func (s *blockStore) openWriteFile(fileNum uint32) (filer, error) {
	// The current block file needs to be read-write so it is possible to
	// append to it. Also, it shouldn't be part of the least recently used
	// file.
	filePath := blockFilePath(s.basePath, fileNum)
	file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		str := fmt.Sprintf("failed to open file %q: %v", filePath, err)
		return nil, makeDbErr(database.ErrDriverSpecific, str, err)
	}

	return file, nil
}
// openFile returns a read-only file handle for the passed flat file number.
// The function also keeps track of the open files, performs least recently
// used tracking, and limits the number of open files to maxOpenFiles by
// closing the least recently used file as needed.
//
// This function MUST be called with the overall files mutex (s.obfMutex)
// locked for writes.
func (s *blockStore) openFile(fileNum uint32) (*lockableFile, error) {
	// Open the appropriate file as read-only.
	filePath := blockFilePath(s.basePath, fileNum)
	file, err := os.Open(filePath)
	if err != nil {
		return nil, makeDbErr(database.ErrDriverSpecific, err.Error(),
			err)
	}
	blockFile := &lockableFile{file: file}

	// Close the least recently used file if the file exceeds the max
	// allowed open files. This is not done until after the file open in
	// case the file fails to open, in which case there is no need to close
	// any files.
	//
	// A write lock is required on the LRU list here to protect against
	// modifications happening as already open files are read from and
	// shuffled to the front of the list.
	//
	// Also, add the file that was just opened to the front of the least
	// recently used list to indicate it is the most recently used file and
	// therefore should be closed last.
	s.lruMutex.Lock()
	lruList := s.openBlocksLRU
	if lruList.Len() >= maxOpenFiles {
		lruFileNum := lruList.Remove(lruList.Back()).(uint32)
		oldBlockFile := s.openBlockFiles[lruFileNum]

		// Close the old file under the write lock for the file in case
		// any readers are currently reading from it so it's not closed
		// out from under them.
		oldBlockFile.Lock()
		_ = oldBlockFile.file.Close()
		oldBlockFile.Unlock()

		delete(s.openBlockFiles, lruFileNum)
		delete(s.fileNumToLRUElem, lruFileNum)
	}
	s.fileNumToLRUElem[fileNum] = lruList.PushFront(fileNum)
	s.lruMutex.Unlock()

	// Store a reference to it in the open block files map.
	s.openBlockFiles[fileNum] = blockFile

	return blockFile, nil
}
// deleteFile removes the block file for the passed flat file number. The file
// must already be closed and it is the responsibility of the caller to do any
// other state cleanup necessary.
func (s *blockStore) deleteFile(fileNum uint32) error {
	filePath := blockFilePath(s.basePath, fileNum)
	if err := os.Remove(filePath); err != nil {
		return makeDbErr(database.ErrDriverSpecific, err.Error(), err)
	}

	return nil
}
// blockFile attempts to return an existing file handle for the passed flat
// file number if it is already open as well as marking it as most recently
// used. It will also open the file when it's not already open subject to the
// rules described in openFile.
//
// NOTE: The returned block file will already have the read lock acquired and
// the caller MUST call .RUnlock() to release it once it has finished all read
// operations. This is necessary because otherwise it would be possible for a
// separate goroutine to close the file after it is returned from here, but
// before the caller has acquired a read lock.
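// A typical call site therefore follows the pattern used by readBlock and
// readBlockRegion below (shown here only as an illustration):
//
//	blockFile, err := s.blockFile(loc.blockFileNum)
//	if err != nil {
//		return nil, err
//	}
//	_, err = blockFile.file.ReadAt(buf, int64(loc.fileOffset))
//	blockFile.RUnlock()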
func (s *blockStore) blockFile(fileNum uint32) (*lockableFile, error) {
	// When the requested block file is open for writes, return it.
	wc := s.writeCursor
	wc.RLock()
	if fileNum == wc.curFileNum && wc.curFile.file != nil {
		obf := wc.curFile
		obf.RLock()
		wc.RUnlock()
		return obf, nil
	}
	wc.RUnlock()

	// Try to return an open file under the overall files read lock.
	s.obfMutex.RLock()
	if obf, ok := s.openBlockFiles[fileNum]; ok {
		s.lruMutex.Lock()
		s.openBlocksLRU.MoveToFront(s.fileNumToLRUElem[fileNum])
		s.lruMutex.Unlock()

		obf.RLock()
		s.obfMutex.RUnlock()
		return obf, nil
	}
	s.obfMutex.RUnlock()

	// Since the file isn't open already, need to check the open block files
	// map again under write lock in case multiple readers got here and a
	// separate one is already opening the file.
	s.obfMutex.Lock()
	if obf, ok := s.openBlockFiles[fileNum]; ok {
		obf.RLock()
		s.obfMutex.Unlock()
		return obf, nil
	}

	// The file isn't open, so open it while potentially closing the least
	// recently used one as needed.
	obf, err := s.openFileFunc(fileNum)
	if err != nil {
		s.obfMutex.Unlock()
		return nil, err
	}
	obf.RLock()
	s.obfMutex.Unlock()

	return obf, nil
}
// writeData is a helper function for writeBlock which writes the provided data
// at the current write offset and updates the write cursor accordingly. The
// field name parameter is only used when there is an error to provide a nicer
// error message.
//
// The write cursor will be advanced the number of bytes actually written in
// the event of failure.
//
// NOTE: This function MUST be called with the write cursor current file lock
// held and must only be called during a write transaction so it is effectively
// locked for writes. Also, the write cursor current file must NOT be nil.
func (s *blockStore) writeData(data []byte, fieldName string) error {
	wc := s.writeCursor
	n, err := wc.curFile.file.WriteAt(data, int64(wc.curOffset))
	wc.curOffset += uint32(n)
	if err != nil {
		str := fmt.Sprintf("failed to write %s to file %d at "+
			"offset %d: %v", fieldName, wc.curFileNum,
			wc.curOffset-uint32(n), err)
		return makeDbErr(database.ErrDriverSpecific, str, err)
	}

	return nil
}
// writeBlock appends the specified raw block bytes to the store's write cursor
// location and increments it accordingly. When the block would exceed the max
// file size for the current flat file, this function will close the current
// file, create the next file, update the write cursor, and write the block to
// the new file.
//
// The write cursor will also be advanced the number of bytes actually written
// in the event of failure.
//
// Format: <network><block length><serialized block><checksum>
func (s *blockStore) writeBlock(rawBlock []byte) (blockLocation, error) {
	// Compute how many bytes will be written.
	// 4 bytes for the block network + 4 bytes for the block length +
	// length of raw block + 4 bytes for the checksum.
	blockLen := uint32(len(rawBlock))
	fullLen := blockLen + 12
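	// For example, a 250-byte serialized block occupies 262 bytes on disk
	// once the 12 bytes of record overhead enumerated above are included.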
	// Move to the next block file if adding the new block would exceed the
	// max allowed size for the current block file. Also detect overflow to
	// be paranoid, even though it isn't currently possible, since numbers
	// might change in the future to make it possible.
	//
	// NOTE: The writeCursor.offset field isn't protected by the mutex
	// since it's only read/changed during this function which can only be
	// called during a write transaction, of which there can be only one at
	// a time.
	wc := s.writeCursor
	finalOffset := wc.curOffset + fullLen
	if finalOffset < wc.curOffset || finalOffset > s.maxBlockFileSize {
		// This is done under the write cursor lock since the curFileNum
		// field is accessed elsewhere by readers.
		//
		// Close the current write file to force a read-only reopen
		// with LRU tracking. The close is done under the write lock
		// for the file to prevent it from being closed out from under
		// any readers currently reading from it.
		wc.Lock()
		wc.curFile.Lock()
		if wc.curFile.file != nil {
			_ = wc.curFile.file.Close()
			wc.curFile.file = nil
		}
		wc.curFile.Unlock()

		// Start writes into next file.
		wc.curFileNum++
		wc.curOffset = 0
		wc.Unlock()
	}

	// All writes are done under the write lock for the file to ensure any
	// readers are finished and blocked first.
	wc.curFile.Lock()
	defer wc.curFile.Unlock()
	// Open the current file if needed. This will typically only be the
	// case when moving to the next file to write to or on initial database
	// load. However, it might also be the case if rollbacks happened after
	// file writes started during a transaction commit.
	if wc.curFile.file == nil {
		file, err := s.openWriteFileFunc(wc.curFileNum)
		if err != nil {
			return blockLocation{}, err
		}
		wc.curFile.file = file
	}
	origOffset := wc.curOffset
	hasher := crc32.New(castagnoli)
	var scratch [4]byte
	byteOrder.PutUint32(scratch[:], uint32(s.network))
	if err := s.writeData(scratch[:], "network"); err != nil {
		return blockLocation{}, err
	}
	_, _ = hasher.Write(scratch[:])

	byteOrder.PutUint32(scratch[:], blockLen)
	if err := s.writeData(scratch[:], "block length"); err != nil {
		return blockLocation{}, err
	}
	_, _ = hasher.Write(scratch[:])

	if err := s.writeData(rawBlock[:], "block"); err != nil {
		return blockLocation{}, err
	}
	_, _ = hasher.Write(rawBlock)

	// Castagnoli CRC-32 as a checksum of all the previous.
	if err := s.writeData(hasher.Sum(nil), "checksum"); err != nil {
		return blockLocation{}, err
	}

	loc := blockLocation{
		blockFileNum: wc.curFileNum,
		fileOffset:   origOffset,
		blockLen:     fullLen,
	}
	return loc, nil
}
// readBlock reads the specified block record and returns the serialized block.
// It ensures the integrity of the block data by checking that the serialized
// network matches the current network associated with the block store and
// comparing the calculated checksum against the one stored in the flat file.
// This function also automatically handles all file management such as opening
// and closing files as necessary to stay within the maximum allowed open files
// limit.
//
// Returns ErrDriverSpecific if the data fails to read for any reason and
// ErrCorruption if the checksum of the read data doesn't match the checksum
// read from the file.
//
// Format: <network><block length><serialized block><checksum>
func (s *blockStore) readBlock(hash *chainhash.Hash, loc blockLocation) ([]byte, error) {
	// Get the referenced block file handle, opening the file as needed. The
	// function also handles closing files as needed to avoid going over the
	// max allowed open files.
	blockFile, err := s.blockFile(loc.blockFileNum)
	if err != nil {
		return nil, err
	}

	serializedData := make([]byte, loc.blockLen)
	n, err := blockFile.file.ReadAt(serializedData, int64(loc.fileOffset))
	blockFile.RUnlock()
	if err != nil {
		str := fmt.Sprintf("failed to read block %s from file %d, "+
			"offset %d: %v", hash, loc.blockFileNum, loc.fileOffset,
			err)
		return nil, makeDbErr(database.ErrDriverSpecific, str, err)
	}
	// Calculate the checksum of the read data and ensure it matches the
	// serialized checksum. This will detect any data corruption in the
	// flat file without having to do much more expensive merkle root
	// calculations on the loaded block.
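	// The stored checksum is interpreted as big-endian here because the
	// hash/crc32 digest's Sum method appends the CRC-32 value in big-endian
	// byte order, which is how writeBlock stores it via hasher.Sum(nil).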
	serializedChecksum := binary.BigEndian.Uint32(serializedData[n-4:])
	calculatedChecksum := crc32.Checksum(serializedData[:n-4], castagnoli)
	if serializedChecksum != calculatedChecksum {
		str := fmt.Sprintf("block data for block %s checksum "+
			"does not match - got %x, want %x", hash,
			calculatedChecksum, serializedChecksum)
		return nil, makeDbErr(database.ErrCorruption, str, nil)
	}
	// The network associated with the block must match the current active
	// network, otherwise somebody probably put the block files for the
	// wrong network in the directory.
	serializedNet := byteOrder.Uint32(serializedData[:4])
	if serializedNet != uint32(s.network) {
		str := fmt.Sprintf("block data for block %s is for the "+
			"wrong network - got %d, want %d", hash, serializedNet,
			uint32(s.network))
		return nil, makeDbErr(database.ErrDriverSpecific, str, nil)
	}

	// The raw block excludes the network, length of the block, and the
	// checksum.
	return serializedData[8 : n-4], nil
}
// readBlockRegion reads the specified amount of data at the provided offset
// for a given block location. The offset is relative to the start of the
// serialized block (as opposed to the beginning of the block record). This
// function automatically handles all file management such as opening and
// closing files as necessary to stay within the maximum allowed open files
// limit.
//
// Returns ErrDriverSpecific if the data fails to read for any reason.
func (s *blockStore) readBlockRegion(loc blockLocation, offset, numBytes uint32) ([]byte, error) {
	// Get the referenced block file handle, opening the file as needed. The
	// function also handles closing files as needed to avoid going over the
	// max allowed open files.
	blockFile, err := s.blockFile(loc.blockFileNum)
	if err != nil {
		return nil, err
	}
	// Regions are offsets into the actual block; however, the serialized
	// data for a block includes an initial 4 bytes for the network + 4
	// bytes for the block length. Thus, add 8 bytes to adjust.
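	// For example, a region starting at offset 100 within a block whose
	// record begins at file offset 5000 is read starting at file offset
	// 5108 (5000 + 8 bytes of header + 100).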
	readOffset := loc.fileOffset + 8 + offset
	serializedData := make([]byte, numBytes)
	_, err = blockFile.file.ReadAt(serializedData, int64(readOffset))
	blockFile.RUnlock()
	if err != nil {
		str := fmt.Sprintf("failed to read region from block file %d, "+
			"offset %d, len %d: %v", loc.blockFileNum, readOffset,
			numBytes, err)
		return nil, makeDbErr(database.ErrDriverSpecific, str, err)
	}

	return serializedData, nil
}
// syncBlocks performs a file system sync on the flat file associated with the
// store's current write cursor. It is safe to call even when there is no
// current write file, in which case it will have no effect.
//
// This is used when flushing cached metadata updates to disk to ensure all the
// block data is fully written before updating the metadata. This ensures the
// metadata and block data can be properly reconciled in failure scenarios.
func (s *blockStore) syncBlocks() error {
	wc := s.writeCursor
	wc.RLock()
	defer wc.RUnlock()

	// Nothing to do if there is no current file associated with the write
	// cursor.
	wc.curFile.RLock()
	defer wc.curFile.RUnlock()
	if wc.curFile.file == nil {
		return nil
	}

	// Sync the file to disk.
	if err := wc.curFile.file.Sync(); err != nil {
		str := fmt.Sprintf("failed to sync file %d: %v", wc.curFileNum,
			err)
		return makeDbErr(database.ErrDriverSpecific, str, err)
	}

	return nil
}
// handleRollback rolls the block files on disk back to the provided file number
// and offset. This involves potentially deleting and truncating the files that
// were partially written.
//
// There are effectively two scenarios to consider here:
//   1) Transient write failures from which recovery is possible
//   2) More permanent failures such as hard disk death and/or removal
//
// In either case, the write cursor will be repositioned to the old block file
// offset regardless of any other errors that occur while attempting to undo
// writes.
//
// For the first scenario, this will lead to any data which failed to be undone
// being overwritten and thus behaves as desired as the system continues to run.
//
// For the second scenario, the metadata which stores the current write cursor
// position within the block files will not have been updated yet and thus, if
// the system eventually recovers (perhaps the hard drive is reconnected), it
// will also lead to any data which failed to be undone being overwritten and
// thus behaves as desired.
//
// Therefore, any errors are simply logged at a warning level rather than being
// returned since there is nothing more that could be done about it anyway.
func (s *blockStore) handleRollback(oldBlockFileNum, oldBlockOffset uint32) {
	// Grab the write cursor mutex since it is modified throughout this
	// function.
	wc := s.writeCursor
	wc.Lock()
	defer wc.Unlock()

	// Nothing to do if the rollback point is the same as the current write
	// cursor.
	if wc.curFileNum == oldBlockFileNum && wc.curOffset == oldBlockOffset {
		return
	}

	// Regardless of any failures that happen below, reposition the write
	// cursor to the old block file and offset.
	defer func() {
		wc.curFileNum = oldBlockFileNum
		wc.curOffset = oldBlockOffset
	}()

	log.Debugf("ROLLBACK: Rolling back to file %d, offset %d",
		oldBlockFileNum, oldBlockOffset)
	// Close the current write file if it needs to be deleted. Then delete
	// all files that are newer than the provided rollback file while
	// also moving the write cursor file backwards accordingly.
	if wc.curFileNum > oldBlockFileNum {
		wc.curFile.Lock()
		if wc.curFile.file != nil {
			_ = wc.curFile.file.Close()
			wc.curFile.file = nil
		}
		wc.curFile.Unlock()
	}
	for ; wc.curFileNum > oldBlockFileNum; wc.curFileNum-- {
		if err := s.deleteFileFunc(wc.curFileNum); err != nil {
			log.Warnf("ROLLBACK: Failed to delete block file "+
				"number %d: %v", wc.curFileNum, err)
			return
		}
	}

	// Open the file for the current write cursor if needed.
	wc.curFile.Lock()
	if wc.curFile.file == nil {
		obf, err := s.openWriteFileFunc(wc.curFileNum)
		if err != nil {
			wc.curFile.Unlock()
			log.Warnf("ROLLBACK: %v", err)
			return
		}
		wc.curFile.file = obf
	}
	// Truncate the file to the provided rollback offset.
	if err := wc.curFile.file.Truncate(int64(oldBlockOffset)); err != nil {
		wc.curFile.Unlock()
		log.Warnf("ROLLBACK: Failed to truncate file %d: %v",
			wc.curFileNum, err)
		return
	}

	// Sync the file to disk.
	err := wc.curFile.file.Sync()
	wc.curFile.Unlock()
	if err != nil {
		log.Warnf("ROLLBACK: Failed to sync file %d: %v",
			wc.curFileNum, err)
		return
	}
}
// scanBlockFiles searches the database directory for all flat block files to
// find the end of the most recent file. This position is considered the
// current write cursor which is also stored in the metadata. Thus, it is used
// to detect unexpected shutdowns in the middle of writes so the block files
// can be reconciled.
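//
// For example, if the directory contains 000000000.fdb through 000000002.fdb,
// the scan reports file number 2 along with the current length in bytes of
// 000000002.fdb, and writing resumes from that position.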
func scanBlockFiles(dbPath string) (int, uint32) {
	lastFile := -1
	fileLen := uint32(0)
	for i := 0; ; i++ {
		filePath := blockFilePath(dbPath, uint32(i))
		st, err := os.Stat(filePath)
		if err != nil {
			break
		}
		lastFile = i
		fileLen = uint32(st.Size())
	}

	log.Tracef("Scan found latest block file #%d with length %d", lastFile,
		fileLen)
	return lastFile, fileLen
}
// newBlockStore returns a new block store with the current block file number
// and offset set and all fields initialized.
func newBlockStore(basePath string, network wire.BitcoinNet) *blockStore {
	// Look for the end of the latest block file to determine what the
	// write cursor position is from the viewpoint of the block files on
	// disk.
	fileNum, fileOff := scanBlockFiles(basePath)
	if fileNum == -1 {
		fileNum = 0
		fileOff = 0
	}

	store := &blockStore{
		network:          network,
		basePath:         basePath,
		maxBlockFileSize: maxBlockFileSize,
		openBlockFiles:   make(map[uint32]*lockableFile),
		openBlocksLRU:    list.New(),
		fileNumToLRUElem: make(map[uint32]*list.Element),

		writeCursor: &writeCursor{
			curFile:    &lockableFile{},
			curFileNum: uint32(fileNum),
			curOffset:  fileOff,
		},
	}
	store.openFileFunc = store.openFile
	store.openWriteFileFunc = store.openWriteFile
	store.deleteFileFunc = store.deleteFile
	return store
}
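// The assignments above are what the whitebox tests override when they need to
// work with mock files. An illustrative (hypothetical) override might look
// like:
//
//	store := newBlockStore(dbPath, wire.MainNet)
//	store.openWriteFileFunc = func(fileNum uint32) (filer, error) {
//		return nil, makeDbErr(database.ErrDriverSpecific,
//			"simulated open failure", nil)
//	}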