diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go
index 1914b0ec7c..b61a637020 100644
--- a/zstd/_generate/gen.go
+++ b/zstd/_generate/gen.go
@@ -561,7 +561,7 @@ type executeSimple struct{}
 //
 // See copyMemory()
 func (e executeSimple) copySize() int {
-	return 16
+	return 32
 }
 
 func (e executeSimple) generateProcedure(name string) {
@@ -716,14 +716,19 @@ func (e executeSimple) generateProcedure(name string) {
 func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual) {
 	label := "copy_" + suffix
 	ofs := GP64()
-	s := Mem{Base: src, Index: ofs, Scale: 1}
-	d := Mem{Base: dst, Index: ofs, Scale: 1}
+	s0 := Mem{Base: src, Index: ofs, Scale: 1}
+	s1 := Mem{Base: src, Index: ofs, Scale: 1, Disp: 16}
+	d0 := Mem{Base: dst, Index: ofs, Scale: 1}
+	d1 := Mem{Base: dst, Index: ofs, Scale: 1, Disp: 16}
 
 	XORQ(ofs, ofs)
 	Label(label)
-	t := XMM()
-	MOVUPS(s, t)
-	MOVUPS(t, d)
+	t0 := XMM()
+	t1 := XMM()
+	MOVUPS(s0, t0)
+	MOVUPS(s1, t1)
+	MOVUPS(t0, d0)
+	MOVUPS(t1, d1)
 	ADDQ(U8(e.copySize()), ofs)
 	CMPQ(ofs, length)
 	JB(LabelRef(label))
diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s
index bee3fd8a04..699a19723d 100644
--- a/zstd/seqdec_amd64.s
+++ b/zstd/seqdec_amd64.s
@@ -619,7 +619,7 @@ main_loop:
 
 	// Check if we won't overflow ctx.out while fast copying
 	LEAQ (R10)(R11*1), R12
-	LEAQ 16(R8)(R12*1), R13
+	LEAQ 32(R8)(R12*1), R13
 	CMPQ R13, SI
 	JA slow_path
 
@@ -634,8 +634,10 @@ main_loop:
 
 copy_1:
 	MOVUPS (DI)(R12*1), X0
+	MOVUPS 16(DI)(R12*1), X1
 	MOVUPS X0, (BX)(R12*1)
-	ADDQ $0x10, R12
+	MOVUPS X1, 16(BX)(R12*1)
+	ADDQ $0x20, R12
 	CMPQ R12, R11
 	JB copy_1
 	ADDQ R11, DI
@@ -658,8 +660,10 @@ copy_match:
 
 copy_2:
 	MOVUPS (R12)(R11*1), X0
+	MOVUPS 16(R12)(R11*1), X1
 	MOVUPS X0, (BX)(R11*1)
-	ADDQ $0x10, R11
+	MOVUPS X1, 16(BX)(R11*1)
+	ADDQ $0x20, R11
 	CMPQ R11, R10
 	JB copy_2
 	ADDQ R10, BX