Contents |
Below is the layout of an FVD image.
| Header |
| Bitmap |
| Journal |
| Table |
| Data Chunk 1 |
| Data Chunk 2 |
| ... |
/*
 * Header of an FVD image. It is the first region in the image layout and
 * records where the other regions (bitmap, journal, table, data chunks) are
 * located, together with the copy-on-read and prefetching settings.
 *
 * Declared as a typedef so that `FvdHeader` names the type; `struct FvdHeader`
 * remains usable as well.
 */
typedef struct FvdHeader {
    uint32_t magic;                 /* FVD\0 */
    uint32_t version;
    uint64_t virtual_disk_size;     /* in bytes. The disk size perceived by VM. */
    uint64_t data_offset;           /* in bytes. Where data chunks start. */

    /* Data can be optionally stored in a different file. */
    char data_file[1024];
    char data_file_fmt[16];

    /* Base image. */
    char base_img[1024];
    char base_img_fmt[16];
    uint64_t base_img_size;         /* in bytes. */

    /* Bitmap for base image. */
    uint64_t bitmap_offset;         /* in bytes. */
    uint64_t bitmap_size;           /* in bytes. */
    uint64_t block_size;            /* in bytes. One bit represents the state of one block. */

    /* Table for compact image. */
    uint64_t table_offset;          /* in bytes. */
    uint64_t table_size;            /* in bytes. */
    uint64_t chunk_size;            /* in bytes. One table entry maps the address of one chunk. */
    uint64_t storage_grow_unit;     /* in bytes. */
    char add_storage_cmd[1024];

    /* Journal */
    uint64_t journal_offset;        /* in bytes. */
    uint64_t journal_size;          /* in bytes. */
    uint64_t stable_journal_epoch;  /* Table journal records are replayed only if their epoch is greater than this. */
    uint32_t clean_shutdown;        /* true if the disk was closed gracefully last time. */

    /* Copy-on-read and prefetching. */
    uint32_t copy_on_read;          /* true or false */
    uint64_t max_outstanding_copy_on_read_data;         /* in bytes. */
    int64_t prefetch_start_delay;   /* in seconds. */
    uint32_t base_img_fully_prefetched;
    uint32_t num_prefetch_slots;
    uint64_t bytes_per_prefetch;
    uint64_t prefetch_min_read_throughput;              /* in KB/second. */
    uint64_t prefetch_max_read_throughput;              /* in KB/second. */
    uint64_t prefetch_min_write_throughput;             /* in KB/second. */
    uint64_t prefetch_max_write_throughput;             /* in KB/second. */
    uint64_t prefetch_throttle_time;                    /* NOTE(review): unit not stated here -- confirm. */
    uint64_t prefetch_read_throughput_measure_time;     /* in milliseconds. */
    uint64_t prefetch_write_throughput_measure_time;    /* in milliseconds. */

    int32_t need_zero_init;         /* Support optional bitmap optimization. */
    uint8_t reserved[4096];         /* Future extension. */
} FvdHeader;
Explanation of some header fields:
The storage space for the bitmap is preallocated in the image. A bit is 0 if the corresponding block is in the base image, and the bit is 1 if the block is in the FVD image. The default size of a block is 64KB. To represent the state of a 1TB base image, FVD needs only a 2MB bitmap (1TB / 64KB = 16M blocks, at one bit per block).
The storage space for the lookup table is preallocated in the image. One entry in the lookup table maps the virtual disk address of a chunk to an offset in the FVD image where the chunk is stored. The default size of a chunk is 1MB. For a 1TB virtual disk, the size of the lookup table is 4MB (1TB / 1MB = 1M entries, at 4 bytes per entry).
The storage space for the journal is preallocated in the image. The default size of the journal is 16MB. Each sector in the journal is used independently so that it can be updated atomically. A journal sector can contain one or more journal records. Currently, there are two types of journal records, and other types can be added.
/*
 * Journal record describing an update to the bitmap. It is one of the two
 * journal record types; the `type` field distinguishes them.
 */
struct BitmapUpdateRecord {
    uint32_t type;                  /* BITMAP_JRECORD = 0x3F2AB8ED */
    uint32_t num_dirty_sectors;     /* presumably the length of the dirty range, in sectors -- confirm */
    uint64_t dirty_sector_begin;    /* presumably the first sector of the dirty range -- confirm */
};
/*
 * Journal record describing an update to the lookup table. It is one of the
 * two journal record types; the `type` field distinguishes them.
 *
 * The record is variable-length: `dirty_table_entries` holds exactly
 * `num_dirty_table_entries` elements, expressed here as a C99 flexible
 * array member.
 *
 * NOTE(review): a compiler will insert padding between `type` and
 * `journal_epoch` unless the struct is packed. Confirm against the on-disk
 * journal format before using this struct to read or write records directly.
 */
struct TableUpdateRecord {
    uint32_t type;                      /* TABLE_JRECORD = 0xB4E6F7AC */
    uint64_t journal_epoch;             /* compared against the header's stable_journal_epoch during recovery */
    uint32_t num_dirty_table_entries;   /* number of elements in dirty_table_entries[] */
    uint32_t dirty_table_begin;         /* presumably the index of the first updated table entry -- confirm */
    uint32_t dirty_table_entries[];     /* num_dirty_table_entries elements */
};
/*
 * Pseudocode: handling of a read request.
 *
 * Data is read from the FVD image when there is no base image, when
 * prefetching has finished, or when the bitmap indicates that the data is in
 * the FVD image. In that case the lookup table (if enabled) translates the
 * address, and an empty table entry yields zeros.
 *
 * Otherwise data is read from the base image and returned to the VM first;
 * if copy-on-read is enabled, a copy is then stored in the FVD image and the
 * in-memory bitmap and table are updated without journaling the change.
 */
read()
{
if (no_base_image || prefetching_finished || bitmap_indicates_data_in_fvd_image) {
/* Read data from the FVD image. */
if (table_is_disabled) {
Read data directly from the FVD image without address translation;
} else if (table_entry_is_empty) {
Fill the data buffer with zeros;
} else {
Use the table entry to locate the data and read them from the FVD image;
}
} else {
Read data from the base image;
Return data to VM, and invoke callback to acknowledge the completion of read.
/* copy-on-read is not on the critical path of read. */
if (copy_on_read) {
store_data(); /* See below. Make a copy of the data in the FVD image. */
Update in-memory bitmap and table, but unlike copy-on-write, regardless of
the cache setting (writethrough, writeback, or none), do not save metadata
changes to the journal now. This reduces overhead.
}
}
}
/*
 * Pseudocode: handling of a write request.
 *
 * Step 1 stores the data via store_data(). If the target blocks may still
 * live in the base image, complete blocks are first constructed by merging
 * base-image data with the new data.
 *
 * Step 2 runs only if the table or the bitmap changed. With
 * cache == writethrough the metadata change is written to a journal sector
 * immediately; a full journal is first made reusable by writing the table
 * and the bitmap to disk, each followed by a flush. With any other cache
 * setting the change is kept in memory until the next flush or a timeout.
 */
write()
{
if (no_base_image || prefetching_finished || bitmap_indicates_data_in_fvd_image) {
/* No need to read data from the base image. Store data in the FVD image directly. */
store_data(); /* See below. */
} else {
Construct complete blocks by reading data from the
base image and merging them with the new data to be written;
store_data(); /* See below. */
}
if (table_updated || bitmap_updated) {
if (cache == writethrough) {
/* Write metadata changes to the journal immediately. */
if (journal_is_full) {
write_table_to_disk();
flush();
write_bitmap_to_disk();
flush(); /* The journal now can be reused. */
}
Allocate a free journal sector;
Write metadata changes to the journal sector;
} else {
/* cache==writeback or cache==none. */
Record metadata changes in memory, which will be written to the journal on
the next flush operation initiated by the VM or after a timeout.
}
}
}
/*
 * Pseudocode: store data in the FVD image.
 *
 * With the table disabled, data is stored without address translation.
 * Otherwise a new chunk is allocated when the table entry is empty, and the
 * data is stored at the location given by the table entry.
 */
store_data()
{
if (table_is_disabled) {
Store data directly in the FVD image without address translation;
} else {
if (table_entry_is_empty) {
Allocate a new chunk to host the data;
}
Store data in the FVD image at the location specified by the table entry;
}
}
/*
 * Pseudocode: replay the journal when the image is opened.
 *
 * Nothing is done if the header says the last shutdown was clean.
 * Otherwise every record in every journal sector is examined: table records
 * are applied only when their epoch is greater than
 * header.stable_journal_epoch, while bitmap records are always applied.
 * Finally the table and the bitmap are written to disk and flushed.
 */
recover_journal()
{
if (header.clean_shutdown) {
return; /* No need to recover from the journal. */
}
for_each_sector_in_journal() {
for_each_record_in_sector() {
if (it_is_a_table_journal_record) {
/* Records at or below the stable epoch are skipped; presumably they are already reflected in the on-disk table -- confirm. */
if (this_record.journal_epoch > header.stable_journal_epoch) {
Use the record to update the table;
}
} else if (it_is_a_bitmap_journal_record) {
Use the record to update the bitmap;
}
}
}
write_table_to_disk();
write_bitmap_to_disk();
flush();
}