Skip to content

Commit

Permalink
aarch64: fix zfs support
Browse files Browse the repository at this point in the history
This patch fixes ZFS support on aarch64. As the issue #1131 explains,
the ZFS page scanner logic clears the access flag of PTEs of relevant
memory-mapped chunks of the files. On Intel, the cpu automatically sets
the flags on first access (read or write) to those pages of memory.
But on ARM it may need to be done by software if the CPU does not have this
capability (it does not on the RPi 4 and Odroid boards I have been using,
possibly due to a QEMU limitation).

So to set the access flags in software, this patch enhances the page
fault handler to detect if relevant fault is access flag related
and does the manual page walk to navigate all the way down to the leaf
PTE based on the virtual memory address retrieved from far_el1.
Then it sets the access flag of the PTE and the dirty flag if the fault
was triggered by a write. Eventually it writes the PTE back to memory
and issues necessary `dsb ishst` to force completion of writes to page
table entries and flush cpu pipeline.

Finally, this patch adjusts `scripts/build` to support building ZFS
images on arm and makes ZFS a default filesystem as on x86_64.

Besides running all unit tests on ZFS image I have also verified that
more involved tests like misc-zfs-io.cc work as well.

Fixes #1131

Signed-off-by: Waldemar Kozaczuk <jwkozaczuk@gmail.com>
  • Loading branch information
wkozaczuk committed May 4, 2022
1 parent d8d2719 commit ea9cb44
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 13 deletions.
57 changes: 57 additions & 0 deletions arch/aarch64/mmu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,59 @@
#include "arch-cpu.hh"
#include "exceptions.hh"

#define ACCESS_FLAG_FAULT_LEVEL_3(esr) ((esr & 0b0111111) == 0x0b) // 0xb = 0b1011 indicates level 3
#define ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(esr) ((esr & 0b1111111) == 0x4b)

TRACEPOINT(trace_mmu_vm_access_flag_fault, "addr=%p", void *);

// Translate a physical address into a typed virtual pointer by offsetting
// into the linear physical-memory mapping (mmu::phys_mem).
template <typename T>
T* phys_to_virt_cast(mmu::phys pa)
{
    return static_cast<T*>(static_cast<void*>(mmu::phys_mem + pa));
}

// Handle an access flag fault taken on a level-3 (leaf) PTE: walk the page
// table down to the leaf entry covering 'addr' (taken from far_el1 by the
// caller), set its access flag — and its dirty flag if the fault was caused
// by a write — then write the entry back and synchronize.
static void handle_access_flag_fault(exception_frame *ef, u64 addr) {
    trace_mmu_vm_access_flag_fault((void*)addr);

    // The access bit of a PTE (Page Table Entry) at level 3 got cleared and we need
    // to set it to handle this page fault. Therefore we need to do a page walk
    // to navigate down to the level 3 and identify relevant PTE.
    //
    // NOTE: the hw_ptep<> template parameter counts levels in the opposite
    // direction to the ARM level numbering — hw_ptep<3> is the table
    // referenced by the root entry (ARM level 0) and hw_ptep<0> holds the
    // leaf entries (ARM level 3); pt_index() follows the same numbering.

    // Start with root PTE
    auto root_pt = mmu::get_root_pt(addr);
    auto root_ptep = mmu::hw_ptep<4>::force(root_pt);

    // ARM level 0: first locate the table the root entry points to, then the
    // entry at the index encoded in the virtual address
    auto l3_ptep_table = mmu::hw_ptep<3>::force(phys_to_virt_cast<mmu::pt_element<3>>(root_ptep.read().next_pt_addr()));
    auto l3_ptep = l3_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 3));

    // ARM level 1 (first identify the ptep table and then the relevant ptep)
    auto l2_ptep_table = mmu::hw_ptep<2>::force(phys_to_virt_cast<mmu::pt_element<2>>(l3_ptep.read().next_pt_addr()));
    auto l2_ptep = l2_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 2));

    // ARM level 2 (first identify the ptep table and then the relevant ptep)
    auto l1_ptep_table = mmu::hw_ptep<1>::force(phys_to_virt_cast<mmu::pt_element<1>>(l2_ptep.read().next_pt_addr()));
    auto l1_ptep = l1_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 1));

    // ARM level 3 — the leaf level (first identify the ptep table and then the relevant ptep)
    auto l0_ptep_table = mmu::hw_ptep<0>::force(phys_to_virt_cast<mmu::pt_element<0>>(l1_ptep.read().next_pt_addr()));
    auto l0_ptep = l0_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 0));

    // Read leaf PTE
    auto leaf_pte = l0_ptep.read();

    leaf_pte.set_accessed(true);
    // BUGFIX: this previously tested ACCESS_FLAG_FAULT_LEVEL_3, which holds
    // for every fault dispatched to this handler, so the page was always
    // marked dirty even on reads. Only a write (WnR bit 6 of ESR set, which
    // is exactly what the _WHEN_WRITE macro checks) should dirty the page.
    if (ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(ef->esr)) {
        leaf_pte.set_dirty(true);
    }

    l0_ptep.write(leaf_pte);
    // Force completion of the PTE write (dsb ishst + pipeline flush) before
    // returning from the fault
    mmu::synchronize_page_table_modifications();
}

void page_fault(exception_frame *ef)
{
sched::fpu_lock fpu;
Expand All @@ -39,6 +92,10 @@ void page_fault(exception_frame *ef)
abort("trying to execute null pointer");
}

if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) {
return handle_access_flag_fault(ef, addr);
}

/* vm_fault might sleep, so check that the thread is preemptable,
* and that interrupts in the saved pstate are enabled.
* Then enable interrupts for the vm_fault.
Expand Down
20 changes: 9 additions & 11 deletions scripts/build
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,7 @@ host_arch=$(uname -m)

# Default manifest
manifest=bootfs.manifest.skel
if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
# We default to ROFS as ZFS is not supported on ARM until the issue #1131 is fixed
fs_type=${vars[fs]-rofs}
if [[ "$fs_type" == "rofs" ]]; then
vars[create_disk]="true"
fi
else
fs_type=${vars[fs]-zfs}
fi
fs_type=${vars[fs]-zfs}
usrskel_arg=
case $fs_type in
zfs)
Expand All @@ -215,6 +207,10 @@ ramfs)
exit 2
esac

# On aarch64 (whether building on or for it) always create a separate disk
# image, regardless of which filesystem type was selected above
if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
vars[create_disk]="true"
fi

if test -n "${vars[usrskel]}"
then
# Override default skel
Expand Down Expand Up @@ -305,7 +301,9 @@ if [[ ${vars[create_disk]} == "true" ]]; then
bare="$SRC"/scripts/disk.bin
raw_disk=disk
qcow2_disk=disk
upload_kernel_mode="-k"
if [[ "$arch" == 'x64' ]]; then
upload_kernel_mode="-k"
fi
else
partition_offset=$kernel_end
bare=loader.img
Expand All @@ -318,7 +316,7 @@ create_zfs_disk() {
"$SRC"/scripts/imgedit.py setpartition "-f raw ${raw_disk}.raw" 2 $partition_offset $partition_size
qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img
qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1
"$SRC"/scripts/upload_manifest.py -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
"$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
}

create_rofs_disk() {
Expand Down
10 changes: 8 additions & 2 deletions scripts/upload_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import io
StringIO = io.StringIO

host_arch = os.uname().machine

def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('localhost', 0))
Expand Down Expand Up @@ -137,7 +139,11 @@ def main():
make_option('-k',
dest='kernel',
action='store_true',
help='run OSv in direct kernel mode')
help='run OSv in direct kernel mode'),
make_option('--arch',
dest='arch',
default=host_arch,
help="specify QEMU architecture: x86_64, aarch64")
])

(options, args) = opt.parse_args()
Expand All @@ -155,7 +161,7 @@ def main():
kernel_mode_flag = '-k --kernel-path build/release/loader-stripped.elf'
else:
kernel_mode_flag = ''
osv = subprocess.Popen('cd ../..; scripts/run.py %s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,image_path,upload_port), shell=True, stdout=subprocess.PIPE)
osv = subprocess.Popen('cd ../..; scripts/run.py %s --arch=%s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,options.arch,image_path,upload_port), shell=True, stdout=subprocess.PIPE)

upload(osv, manifest, depends, upload_port)

Expand Down

0 comments on commit ea9cb44

Please sign in to comment.