diff --git a/runtime/codert_vm/pnathelp.m4 b/runtime/codert_vm/pnathelp.m4 index 1ae5dd6695b..2b79d494700 100644 --- a/runtime/codert_vm/pnathelp.m4 +++ b/runtime/codert_vm/pnathelp.m4 @@ -1,4 +1,4 @@ -dnl Copyright (c) 2017, 2018 IBM Corp. and others +dnl Copyright (c) 2017, 2019 IBM Corp. and others dnl dnl This program and the accompanying materials are made available under dnl the terms of the Eclipse Public License 2.0 which accompanies this @@ -852,6 +852,63 @@ START_PROC(jitFillOSRBufferReturn) CINTERP(J9TR_bcloop_exit_interpreter, 0) END_PROC(jitFillOSRBufferReturn) +dnl Expects r3 to already contain the vmThread. +dnl Expects r4 to already contain the address being loaded from. +START_PROC(jitReadBarrier) +ifdef({OMR_GC_CONCURRENT_SCAVENGER},{ + staddr r0,JIT_GPR_SAVE_SLOT(0) + SAVE_LR + SAVE_JIT_GOT_REGISTERS + INIT_JIT_TOC + SWITCH_TO_C_STACK + SAVE_C_VOLATILE_REGS + + laddr r5,J9TR_VMThread_javaVM(J9VMTHREAD) + laddr r5,J9TR_JavaVM_memoryManagerFunctions(r5) + laddr FUNC_PTR,J9TR_J9MemoryManagerFunctions_J9ReadBarrier(r5) + CALL_INDIRECT + + RESTORE_C_VOLATILE_REGS + RESTORE_JIT_GOT_REGISTERS + RESTORE_LR + SWITCH_TO_JAVA_STACK + laddr r0,JIT_GPR_SAVE_SLOT(0) + blr +},{ +dnl jitReadBarrier is not supported if OMR_GC_CONCURRENT_SCAVENGER is not set + trap +}) +END_PROC(jitReadBarrier) + +dnl Expects r3 to already contain vmThread. +dnl Expects r4 to already contain srcObj. +dnl Expects r5 to already contain dstObj. +dnl Expects r6 to already contain srcAddr. +dnl Expects r7 to already contain dstAddr. +dnl Expects r8 to already contain length. +START_PROC(jitReferenceArrayCopy) + staddr r0,JIT_GPR_SAVE_SLOT(0) + SAVE_LR + SAVE_JIT_GOT_REGISTERS + INIT_JIT_TOC + SWITCH_TO_C_STACK + SAVE_ALL_REGS + + laddr r9,J9TR_VMThread_javaVM(J9VMTHREAD) + laddr r9,J9TR_JavaVM_memoryManagerFunctions(r9) + laddr FUNC_PTR,J9TR_J9MemoryManagerFunctions_referenceArrayCopy(r9) + CALL_INDIRECT + staddr r3,J9TR_VMThread_returnValue(J9VMTHREAD) + + RESTORE_ALL_REGS + RESTORE_JIT_GOT_REGISTERS + RESTORE_LR + SWITCH_TO_JAVA_STACK + laddr r0,JIT_GPR_SAVE_SLOT(0) + laddr r3,J9TR_VMThread_returnValue(J9VMTHREAD) + blr +END_PROC(jitReferenceArrayCopy) + ifdef({ASM_J9VM_ENV_DATA64},{ BEGIN_RETURN_POINT(jitExitInterpreterJ) diff --git a/runtime/compiler/env/VMJ9.cpp b/runtime/compiler/env/VMJ9.cpp index f7ad66bbeb8..b1c55b7d34b 100644 --- a/runtime/compiler/env/VMJ9.cpp +++ b/runtime/compiler/env/VMJ9.cpp @@ -1920,11 +1920,11 @@ UDATA TR_J9VMBase::thisThreadGetConcurrentScavengeActiveByteAddressOffset() UDATA TR_J9VMBase::thisThreadGetEvacuateBaseAddressOffset() { #if defined(OMR_GC_CONCURRENT_SCAVENGER) -#if defined(J9VM_GC_COMPRESSED_POINTERS) && defined(TR_TARGET_X86) +#if defined(J9VM_GC_COMPRESSED_POINTERS) && !defined(TR_TARGET_S390) return offsetof(J9VMThread, readBarrierRangeCheckBaseCompressed); #else return offsetof(J9VMThread, readBarrierRangeCheckBase); -#endif /* defined(J9VM_GC_COMPRESSED_POINTERS) && defined(TR_TARGET_X86) */ +#endif /* defined(J9VM_GC_COMPRESSED_POINTERS) && !defined(TR_TARGET_S390) */ #else /* defined(OMR_GC_CONCURRENT_SCAVENGER) */ TR_ASSERT(0,"Field readBarrierRangeCheckBase does not exists in J9VMThread."); return 0; @@ -1937,11 +1937,11 @@ UDATA TR_J9VMBase::thisThreadGetEvacuateBaseAddressOffset() UDATA TR_J9VMBase::thisThreadGetEvacuateTopAddressOffset() { #if defined(OMR_GC_CONCURRENT_SCAVENGER) -#if defined(J9VM_GC_COMPRESSED_POINTERS) && defined(TR_TARGET_X86) +#if defined(J9VM_GC_COMPRESSED_POINTERS) && !defined(TR_TARGET_S390) return offsetof(J9VMThread, 
readBarrierRangeCheckTopCompressed); #else return offsetof(J9VMThread, readBarrierRangeCheckTop); -#endif /* defined(J9VM_GC_COMPRESSED_POINTERS) && defined(TR_TARGET_X86) */ +#endif /* defined(J9VM_GC_COMPRESSED_POINTERS) && !defined(TR_TARGET_S390) */ #else /* defined(OMR_GC_CONCURRENT_SCAVENGER) */ TR_ASSERT(0,"Field readBarrierRangeCheckTop does not exists in J9VMThread."); return 0; diff --git a/runtime/compiler/p/codegen/J9CodeGenerator.cpp b/runtime/compiler/p/codegen/J9CodeGenerator.cpp index 11714742a57..813edcc2f36 100644 --- a/runtime/compiler/p/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/p/codegen/J9CodeGenerator.cpp @@ -619,7 +619,10 @@ J9::Power::CodeGenerator::insertPrefetchIfNecessary(TR::Node *node, TR::Register } if (node->getOpCodeValue() == TR::aloadi || - (TR::Compiler->target.is64Bit() && comp()->useCompressedPointers() && node->getOpCodeValue() == TR::iloadi && comp()->getMethodHotness() >= hot)) + (TR::Compiler->target.is64Bit() && + comp()->useCompressedPointers() && + (node->getOpCodeValue() == TR::iloadi || node->getOpCodeValue() == TR::irdbari) && + comp()->getMethodHotness() >= hot)) { TR::Node *firstChild = node->getFirstChild(); optDisabled = disableIteratorPrefetch; diff --git a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp index c64f30d84ea..d427346ff8f 100644 --- a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp +++ b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp @@ -1188,6 +1188,270 @@ TR::Register *J9::Power::TreeEvaluator::awrtbariEvaluator(TR::Node *node, TR::Co return NULL; } +TR::Register *J9::Power::TreeEvaluator::irdbarEvaluator(TR::Node *node, TR::CodeGenerator *cg) + { + TR_ASSERT(TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads(), "iReadBarrierEvaluator"); +#ifdef OMR_GC_CONCURRENT_SCAVENGER + TR_ASSERT(cg->comp()->useCompressedPointers(), "irdbarEvaluator is expecting compressed references"); + + TR::Compilation *comp = cg->comp(); + TR::MemoryReference *tempMR = NULL; + + TR::Register *objReg = cg->allocateRegister(); + TR::Register *locationReg = cg->allocateRegister(); + TR::Register *evacuateReg = cg->allocateRegister(); + TR::Register *r3Reg = cg->allocateRegister(); + TR::Register *r11Reg = cg->allocateRegister(); + TR::Register *metaReg = cg->getMethodMetaDataRegister(); + TR::Register *condReg = cg->allocateRegister(TR_CCR); + + TR::LabelSymbol *startLabel = generateLabelSymbol(cg); + TR::LabelSymbol *endLabel = generateLabelSymbol(cg); + startLabel->setStartInternalControlFlow(); + endLabel->setEndInternalControlFlow(); + + TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 7, cg->trMemory()); + deps->addPostCondition(objReg, TR::RealRegister::NoReg); + deps->addPostCondition(locationReg, TR::RealRegister::gr4); //TR_readBarrier helper needs this in gr4. 
+   deps->addPostCondition(evacuateReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(r3Reg, TR::RealRegister::gr3);
+   deps->addPostCondition(r11Reg, TR::RealRegister::gr11);
+   deps->addPostCondition(metaReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(condReg, TR::RealRegister::NoReg);
+
+   if (node->getSymbolReference()->getSymbol()->isInternalPointer())
+      {
+      objReg->setPinningArrayPointer(node->getSymbolReference()->getSymbol()->castToInternalPointerAutoSymbol()->getPinningArrayPointer());
+      objReg->setContainsInternalPointer();
+      }
+
+   node->setRegister(objReg);
+   bool needSync = (node->getSymbolReference()->getSymbol()->isSyncVolatile() && TR::Compiler->target.isSMP());
+
+   // If the reference is volatile or potentially volatile, we keep a fixed instruction
+   // layout in order to patch if it turns out that the reference isn't really volatile.
+   tempMR = new (cg->trHeapMemory()) TR::MemoryReference(node, 4, cg);
+   if (tempMR->getIndexRegister() != NULL)
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, locationReg, tempMR->getBaseRegister(), tempMR->getIndexRegister());
+   else
+      generateTrg1MemInstruction(cg, TR::InstOpCode::addi2, node, locationReg, tempMR);
+
+   /*
+    * Loading the effective address does not actually need a sync instruction.
+    * However, this point sometimes gets patched so a nop is needed here until all the patching code is updated.
+    */
+   //TODO: Patching code needs to be updated to know not to expect a sync instruction here.
+   if (needSync)
+      {
+      TR::TreeEvaluator::postSyncConditions(node, cg, locationReg, tempMR, TR::InstOpCode::nop);
+      }
+
+   // For the compressed refs case: if monitored loads are supported and we are loading an address, use a monitored load instruction.
+   if (node->getOpCodeValue() == TR::irdbari && cg->getSupportsLM() && node->getSymbolReference()->getSymbol()->getDataType() == TR::Address)
+      {
+      TR::MemoryReference *monitoredLoadMR = new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, 4, cg);
+      monitoredLoadMR->forceIndexedForm(node, cg);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lwzmx, node, objReg, monitoredLoadMR);
+      }
+   else
+      {
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, objReg, new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, 4, cg));
+      }
+
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
+   generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, evacuateReg,
+                              new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateBaseAddressOffset(), 4, cg));
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmpl4, node, condReg, objReg, evacuateReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, condReg);
+
+   generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, evacuateReg,
+                              new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateTopAddressOffset(), 4, cg));
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmpl4, node, condReg, objReg, evacuateReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bgt, node, endLabel, condReg);
+
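+   // The reference falls inside the evacuate region, so the object may be in the middle of
+   // being copied by concurrent scavenge: call the read barrier helper and then re-load the
+   // slot, since the helper may have updated it with the forwarded address.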
+   // TR_readBarrier helper expects the vmThread in r3.
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r3Reg, metaReg);
+
+   TR::SymbolReference *helperSym = comp->getSymRefTab()->findOrCreateRuntimeHelper(TR_readBarrier, false, false, false);
+   generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptrj_t)helperSym->getMethodAddress(), deps, helperSym);
+
+   // For the compressed refs case: if monitored loads are supported and we are loading an address, use a monitored load instruction.
+   if (node->getOpCodeValue() == TR::irdbari && cg->getSupportsLM() && node->getSymbolReference()->getSymbol()->getDataType() == TR::Address)
+      {
+      TR::MemoryReference *monitoredLoadMR = new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, 4, cg);
+      monitoredLoadMR->forceIndexedForm(node, cg);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lwzmx, node, objReg, monitoredLoadMR);
+      }
+   else
+      {
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, objReg, new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, 4, cg));
+      }
+
+   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);
+
+   //TODO: Patching code needs to be updated to be able to patch out these new sync instructions.
+   if (needSync)
+      {
+      generateInstruction(cg, TR::Compiler->target.cpu.id() >= TR_PPCp7 ? TR::InstOpCode::lwsync : TR::InstOpCode::isync, node);
+      }
+
+   cg->insertPrefetchIfNecessary(node, objReg);
+
+   tempMR->decNodeReferenceCounts(cg);
+
+   cg->stopUsingRegister(evacuateReg);
+   cg->stopUsingRegister(locationReg);
+   cg->stopUsingRegister(r3Reg);
+   cg->stopUsingRegister(r11Reg);
+   cg->stopUsingRegister(condReg);
+
+   cg->machine()->setLinkRegisterKilled(true);
+
+   return objReg;
+#else
+   return NULL;
+#endif
+   }
+
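+// ardbarEvaluator emits the same inline read-barrier sequence as irdbarEvaluator above,
+// but for full-width reference loads (Op_load/Op_cmpl at sizeofReferenceAddress instead of
+// the 4-byte lwz/cmpl4 used for compressed refs), and it additionally masks the result
+// when the load is through the VFT symbol.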
+TR::Register *J9::Power::TreeEvaluator::ardbarEvaluator(TR::Node *node, TR::CodeGenerator *cg)
+   {
+   TR_ASSERT(TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads(), "aReadBarrierEvaluator");
+#ifdef OMR_GC_CONCURRENT_SCAVENGER
+   TR::Compilation *comp = cg->comp();
+   TR::MemoryReference *tempMR = NULL;
+
+   TR::Register *tempReg;
+   TR::Register *locationReg = cg->allocateRegister();
+   TR::Register *evacuateReg = cg->allocateRegister();
+   TR::Register *r3Reg = cg->allocateRegister();
+   TR::Register *r11Reg = cg->allocateRegister();
+   TR::Register *metaReg = cg->getMethodMetaDataRegister();
+   TR::Register *condReg = cg->allocateRegister(TR_CCR);
+
+   if (!node->getSymbolReference()->getSymbol()->isInternalPointer())
+      {
+      if (node->getSymbolReference()->getSymbol()->isNotCollected())
+         tempReg = cg->allocateRegister();
+      else
+         tempReg = cg->allocateCollectedReferenceRegister();
+      }
+   else
+      {
+      tempReg = cg->allocateRegister();
+      tempReg->setPinningArrayPointer(node->getSymbolReference()->getSymbol()->castToInternalPointerAutoSymbol()->getPinningArrayPointer());
+      tempReg->setContainsInternalPointer();
+      }
+
+   TR::LabelSymbol *startLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
+   startLabel->setStartInternalControlFlow();
+   endLabel->setEndInternalControlFlow();
+
+   TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 7, cg->trMemory());
+   deps->addPostCondition(tempReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(locationReg, TR::RealRegister::gr4); //TR_readBarrier helper needs this in gr4.
+   deps->addPostCondition(evacuateReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(r3Reg, TR::RealRegister::gr3);
+   deps->addPostCondition(r11Reg, TR::RealRegister::gr11);
+   deps->addPostCondition(metaReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(condReg, TR::RealRegister::NoReg);
+
+   // If the reference is volatile or potentially volatile, we keep a fixed instruction
+   // layout in order to patch if it turns out that the reference isn't really volatile.
+
+   bool needSync = (node->getSymbolReference()->getSymbol()->isSyncVolatile() && TR::Compiler->target.isSMP());
+
+   tempMR = new (cg->trHeapMemory()) TR::MemoryReference(node, TR::Compiler->om.sizeofReferenceAddress(), cg);
+
+   node->setRegister(tempReg);
+
+   if (tempMR->getIndexRegister() != NULL)
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, locationReg, tempMR->getBaseRegister(), tempMR->getIndexRegister());
+   else
+      generateTrg1MemInstruction(cg, TR::InstOpCode::addi2, node, locationReg, tempMR);
+
+   /*
+    * Loading the effective address does not actually need a sync instruction.
+    * However, this point sometimes gets patched so a nop is needed here until all the patching code is updated.
+    */
+   //TODO: Patching code needs to be updated to know not to expect a sync instruction here.
+   if (needSync)
+      {
+      TR::TreeEvaluator::postSyncConditions(node, cg, locationReg, tempMR, TR::InstOpCode::nop);
+      }
+
+   // If monitored loads are supported and we are loading an address, use a monitored load instruction.
+   if (node->getOpCodeValue() == TR::ardbari && cg->getSupportsLM() && node->getSymbolReference()->getSymbol()->getDataType() == TR::Address)
+      {
+      TR::MemoryReference *monitoredLoadMR = new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, TR::Compiler->om.sizeofReferenceAddress(), cg);
+      monitoredLoadMR->forceIndexedForm(node, cg);
+      generateTrg1MemInstruction(cg, TR::Compiler->target.is64Bit() ? TR::InstOpCode::ldmx : TR::InstOpCode::lwzmx, node, tempReg, monitoredLoadMR);
+      }
+   else
+      {
+      generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, tempReg, new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, TR::Compiler->om.sizeofReferenceAddress(), cg));
+      }
+
+   if (node->getSymbolReference() == comp->getSymRefTab()->findVftSymbolRef())
+      TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, tempReg);
+
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
+
+   generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, evacuateReg,
+                              new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateBaseAddressOffset(), TR::Compiler->om.sizeofReferenceAddress(), cg));
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::Op_cmpl, node, condReg, tempReg, evacuateReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, condReg);
+
+   generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, evacuateReg,
+                              new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateTopAddressOffset(), TR::Compiler->om.sizeofReferenceAddress(), cg));
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::Op_cmpl, node, condReg, tempReg, evacuateReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bgt, node, endLabel, condReg);
+
+   // TR_readBarrier helper expects the vmThread in r3.
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r3Reg, metaReg);
+
+   TR::SymbolReference *helperSym = comp->getSymRefTab()->findOrCreateRuntimeHelper(TR_readBarrier, false, false, false);
+   generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptrj_t)helperSym->getMethodAddress(), deps, helperSym);
+
+   // If monitored loads are supported and we are loading an address, use a monitored load instruction.
+   if (node->getOpCodeValue() == TR::ardbari && cg->getSupportsLM() && node->getSymbolReference()->getSymbol()->getDataType() == TR::Address)
+      {
+      TR::MemoryReference *monitoredLoadMR = new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, TR::Compiler->om.sizeofReferenceAddress(), cg);
+      monitoredLoadMR->forceIndexedForm(node, cg);
+      generateTrg1MemInstruction(cg, TR::Compiler->target.is64Bit() ? TR::InstOpCode::ldmx : TR::InstOpCode::lwzmx, node, tempReg, monitoredLoadMR);
+      }
+   else
+      {
+      generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, tempReg, new (cg->trHeapMemory()) TR::MemoryReference(locationReg, 0, TR::Compiler->om.sizeofReferenceAddress(), cg));
+      }
+
+   if (node->getSymbolReference() == comp->getSymRefTab()->findVftSymbolRef())
+      TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, tempReg);
+
+   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);
+
+   //TODO: Patching code needs to be updated to be able to patch out these new sync instructions.
+   if (needSync)
+      {
+      generateInstruction(cg, TR::Compiler->target.cpu.id() >= TR_PPCp7 ? TR::InstOpCode::lwsync : TR::InstOpCode::isync, node);
+      }
+
+   tempMR->decNodeReferenceCounts(cg);
+
+   cg->stopUsingRegister(evacuateReg);
+   cg->stopUsingRegister(locationReg);
+   cg->stopUsingRegister(r3Reg);
+   cg->stopUsingRegister(r11Reg);
+   cg->stopUsingRegister(condReg);
+
+   cg->machine()->setLinkRegisterKilled(true);
+
+   return tempReg;
+#else
+   return NULL;
+#endif
+   }
+
 TR::Register *J9::Power::TreeEvaluator::monentEvaluator(TR::Node *node, TR::CodeGenerator *cg)
    {
    return TR::TreeEvaluator::VMmonentEvaluator(node, cg);
@@ -8546,6 +8810,79 @@ static TR::Register *VMinlineCompareAndSwapObject(TR::Node *node, TR::CodeGenera
    else
       wrtBarEndLabel = doneLabel;
 
+#ifdef OMR_GC_CONCURRENT_SCAVENGER
+   if (TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads())
+      {
+      TR::Register *tmpReg = cg->allocateRegister();
+      TR::Register *locationReg = cg->allocateRegister();
+      TR::Register *evacuateReg = cg->allocateRegister();
+      TR::Register *r3Reg = cg->allocateRegister();
+      TR::Register *r11Reg = cg->allocateRegister();
+      TR::Register *metaReg = cg->getMethodMetaDataRegister();
+
+      TR::LabelSymbol *startReadBarrierLabel = generateLabelSymbol(cg);
+      TR::LabelSymbol *endReadBarrierLabel = generateLabelSymbol(cg);
+
+      TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 9, cg->trMemory());
+      deps->addPostCondition(objReg, TR::RealRegister::NoReg);
+      deps->getPostConditions()->getRegisterDependency(0)->setExcludeGPR0();
+      deps->addPostCondition(offsetReg, TR::RealRegister::NoReg);
+      deps->addPostCondition(tmpReg, TR::RealRegister::NoReg);
+      deps->addPostCondition(locationReg, TR::RealRegister::gr4); //TR_readBarrier helper needs this in gr4.
+ deps->addPostCondition(evacuateReg, TR::RealRegister::NoReg); + deps->addPostCondition(r3Reg, TR::RealRegister::gr3); + deps->addPostCondition(r11Reg, TR::RealRegister::gr11); + deps->addPostCondition(metaReg, TR::RealRegister::NoReg); + deps->addPostCondition(cndReg, TR::RealRegister::NoReg); + + startReadBarrierLabel->setStartInternalControlFlow(); + endReadBarrierLabel->setEndInternalControlFlow(); + + TR::InstOpCode::Mnemonic loadOpCode = TR::InstOpCode::lwz; + TR::InstOpCode::Mnemonic cmpOpCode = TR::InstOpCode::cmpl4; + int32_t loadWidth = 4; + + if (TR::Compiler->target.is64Bit() && !comp->useCompressedPointers()) + { + loadOpCode = TR::InstOpCode::ld; + cmpOpCode = TR::InstOpCode::cmpl8; + loadWidth = 8; + } + + generateTrg1MemInstruction(cg, loadOpCode, node, tmpReg, new (cg->trHeapMemory()) TR::MemoryReference(objReg, offsetReg, loadWidth, cg)); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, startReadBarrierLabel); + + generateTrg1MemInstruction(cg, loadOpCode, node, evacuateReg, + new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateBaseAddressOffset(), loadWidth, cg)); + generateTrg1Src2Instruction(cg, cmpOpCode, node, cndReg, tmpReg, evacuateReg); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endReadBarrierLabel, cndReg); + + generateTrg1MemInstruction(cg, loadOpCode, node, evacuateReg, + new (cg->trHeapMemory()) TR::MemoryReference(metaReg, cg->comp()->fej9()->thisThreadGetEvacuateTopAddressOffset(), loadWidth, cg)); + generateTrg1Src2Instruction(cg, cmpOpCode, node, cndReg, tmpReg, evacuateReg); + generateConditionalBranchInstruction(cg, TR::InstOpCode::bgt, node, endReadBarrierLabel, cndReg); + + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, locationReg, objReg, offsetReg); + + // TR_readBarrier helper expects the vmThread in r3. + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r3Reg, metaReg); + + TR::SymbolReference *helperSym = comp->getSymRefTab()->findOrCreateRuntimeHelper(TR_readBarrier, false, false, false); + generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptrj_t)helperSym->getMethodAddress(), deps, helperSym); + + generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endReadBarrierLabel, deps); + + cg->stopUsingRegister(tmpReg); + cg->stopUsingRegister(locationReg); + cg->stopUsingRegister(evacuateReg); + cg->stopUsingRegister(r11Reg); + cg->stopUsingRegister(r3Reg); + + cg->machine()->setLinkRegisterKilled(true); + } +#endif //OMR_GC_CONCURRENT_SCAVENGER + if (!TR::Options::getCmdLineOptions()->realTimeGC()) resultReg = genCAS(node, cg, objReg, offsetReg, oldVReg, newVReg, cndReg, doneLabel, secondChild, 0, true, (TR::Compiler->target.is64Bit() && !comp->useCompressedPointers())); @@ -10217,7 +10554,10 @@ static TR::Register *inlineConcurrentLinkedQueueTMOffer(TR::Node *node, TR::Code static char * disableTMOffer = feGetEnv("TR_DisableTMOffer"); static char * debugTM = feGetEnv("TR_DebugTM"); - if (disableTMOffer) + /* + * TODO: TM is currently not compatible with read barriers. If read barriers are required, TM is disabled until the issue is fixed. 
+ */ + if (disableTMOffer || TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads()) { generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, resultReg, 3); // TM offer disabled generateLabelInstruction(cg, TR::InstOpCode::b, node, returnLabel); @@ -10420,7 +10760,10 @@ static TR::Register *inlineConcurrentLinkedQueueTMPoll(TR::Node *node, TR::CodeG static char * disableTMPoll = feGetEnv("TR_DisableTMPoll"); static char * debugTM = feGetEnv("TR_DebugTM"); - if (disableTMPoll) + /* + * TODO: TM is currently not compatible with read barriers. If read barriers are required, TM is disabled until the issue is fixed. + */ + if (disableTMPoll || TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads()) { generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, resultReg, 0); // BEFORE WAS 0 generateLabelInstruction(cg, TR::InstOpCode::b, node, returnLabel); @@ -11651,6 +11994,585 @@ static TR::Register *inlineIntrinsicIndexOf(TR::Node *node, bool isLatin1, TR::C return result; } +/* + * Arraycopy evaluator needs a version of inlineArrayCopy that can be used inside internal control flow. For this version of inlineArrayCopy, registers must + * be allocated outside of this function so the dependency at the end of the control flow knows about them. + */ +static void inlineArrayCopy_ICF(TR::Node *node, int64_t byteLen, TR::Register *src, TR::Register *dst, TR::CodeGenerator *cg, TR::Register *cndReg, + TR::Register *tmp1Reg, TR::Register *tmp2Reg, TR::Register *tmp3Reg, TR::Register *tmp4Reg, + TR::Register *fp1Reg, TR::Register *fp2Reg, TR::Register *fp3Reg, TR::Register *fp4Reg) + { + if (byteLen == 0) + return; + + TR::Register *regs[4] = {tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg}; + TR::Register *fpRegs[4] = {fp1Reg, fp2Reg, fp3Reg, fp4Reg}; + int32_t groups, residual; + + static bool disableLEArrayCopyInline = (feGetEnv("TR_disableLEArrayCopyInline") != NULL); + TR_Processor processor = TR::Compiler->target.cpu.id(); + bool supportsLEArrayCopyInline = (processor >= TR_PPCp8) && !disableLEArrayCopyInline && TR::Compiler->target.cpu.isLittleEndian() && TR::Compiler->target.cpu.hasFPU() && TR::Compiler->target.is64Bit(); + + if (TR::Compiler->target.is64Bit()) + { + groups = byteLen >> 5; + residual = byteLen & 0x0000001F; + } + else + { + groups = byteLen >> 4; + residual = byteLen & 0x0000000F; + } + + int32_t regIx = 0, ix = 0, fpRegIx = 0; + int32_t memRefSize; + TR::InstOpCode::Mnemonic load, store; + TR::Compilation* comp = cg->comp(); + + /* Some machines have issues with 64bit loads/stores in 32bit mode, ie. 
sicily, do not check for is64BitProcessor() */ + memRefSize = TR::Compiler->om.sizeofReferenceAddress(); + load = TR::InstOpCode::Op_load; + store = TR::InstOpCode::Op_st; + + TR::LabelSymbol *doneLabel = generateLabelSymbol(cg); + + if (groups != 0) + { + TR::LabelSymbol *loopStart; + + if (groups != 1) + { + if (groups <= UPPER_IMMED) + generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, regs[0], groups); + else + loadConstant(cg, node, groups, regs[0]); + generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, regs[0]); + + loopStart = generateLabelSymbol(cg); + generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStart); + } + + if (supportsLEArrayCopyInline) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fpRegs[3], new (cg->trHeapMemory()) TR::MemoryReference(src, 0, memRefSize, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fpRegs[2], new (cg->trHeapMemory()) TR::MemoryReference(src, memRefSize, memRefSize, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fpRegs[1], new (cg->trHeapMemory()) TR::MemoryReference(src, 2*memRefSize, memRefSize, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fpRegs[0], new (cg->trHeapMemory()) TR::MemoryReference(src, 3*memRefSize, memRefSize, cg)); + } + else + { + generateTrg1MemInstruction(cg, load, node, regs[3], new (cg->trHeapMemory()) TR::MemoryReference(src, 0, memRefSize, cg)); + generateTrg1MemInstruction(cg, load, node, regs[2], new (cg->trHeapMemory()) TR::MemoryReference(src, memRefSize, memRefSize, cg)); + generateTrg1MemInstruction(cg, load, node, regs[1], new (cg->trHeapMemory()) TR::MemoryReference(src, 2*memRefSize, memRefSize, cg)); + generateTrg1MemInstruction(cg, load, node, regs[0], new (cg->trHeapMemory()) TR::MemoryReference(src, 3*memRefSize, memRefSize, cg)); + } + + if (groups != 1) + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, src, src, 4*memRefSize); + + if (supportsLEArrayCopyInline) + { + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 0, memRefSize, cg), fpRegs[3]); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, memRefSize, memRefSize, cg), fpRegs[2]); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 2*memRefSize, memRefSize, cg), fpRegs[1]); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 3*memRefSize, memRefSize, cg), fpRegs[0]); + } + else + { + generateMemSrc1Instruction(cg, store, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 0, memRefSize, cg), regs[3]); + generateMemSrc1Instruction(cg, store, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, memRefSize, memRefSize, cg), regs[2]); + generateMemSrc1Instruction(cg, store, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 2*memRefSize, memRefSize, cg), regs[1]); + generateMemSrc1Instruction(cg, store, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, 3*memRefSize, memRefSize, cg), regs[0]); + } + + if (groups != 1) + { + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi2, node, dst, dst, 4*memRefSize); + generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStart, cndReg); + } + else + { + ix = 4*memRefSize; + } + } + + for (; residual>=memRefSize; residual-=memRefSize, ix+=memRefSize) + { + if (supportsLEArrayCopyInline) + { + TR::Register *oneReg = fpRegs[fpRegIx++]; 
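+      // Wrap back to the first scratch FPR once the rotation passes the last usable one.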
+ if (fpRegIx>3 || fpRegs[fpRegIx]==NULL) + fpRegIx = 0; + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, oneReg, new (cg->trHeapMemory()) TR::MemoryReference(src, ix, memRefSize, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, memRefSize, cg), oneReg); + } + else + { + TR::Register *oneReg = regs[regIx++]; + if (regIx>3 || regs[regIx]==NULL) + regIx = 0; + generateTrg1MemInstruction(cg, load, node, oneReg, new (cg->trHeapMemory()) TR::MemoryReference(src, ix, memRefSize, cg)); + generateMemSrc1Instruction(cg, store, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, memRefSize, cg), oneReg); + } + } + + if (residual != 0) + { + if (residual & 4) + { + if (supportsLEArrayCopyInline) + { + TR::Register *oneReg = fpRegs[fpRegIx++]; + if (fpRegIx>3 || fpRegs[fpRegIx]==NULL) + fpRegIx = 0; + generateTrg1MemInstruction(cg, TR::InstOpCode::lfs, node, oneReg, new (cg->trHeapMemory()) TR::MemoryReference(src, ix, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfs, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, 4, cg), oneReg); + } + else + { + TR::Register *oneReg = regs[regIx++]; + if (regIx>3 || regs[regIx]==NULL) + regIx = 0; + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, oneReg, new (cg->trHeapMemory()) TR::MemoryReference(src, ix, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, 4, cg), oneReg); + } + ix += 4; + } + if (residual & 2) + { + TR::Register *oneReg = regs[regIx++]; + if (regIx>3 || regs[regIx]==NULL) + regIx = 0; + generateTrg1MemInstruction(cg, TR::InstOpCode::lhz, node, oneReg, new (cg->trHeapMemory()) TR::MemoryReference(src, ix, 2, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, 2, cg), oneReg); + ix += 2; + } + if (residual & 1) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, regs[regIx], new (cg->trHeapMemory()) TR::MemoryReference(src, ix, 1, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, new (cg->trHeapMemory()) TR::MemoryReference(dst, ix, 1, cg), regs[regIx]); + } + } + + return; + } + +static void inlineVSXArrayCopy(TR::Node *node, TR::Register *srcAddrReg, TR::Register *dstAddrReg, TR::Register *lengthReg, TR::Register *cndRegister, + TR::Register *tmp1Reg, TR::Register *tmp2Reg, TR::Register *tmp3Reg, TR::Register *tmp4Reg, TR::Register *fp1Reg, TR::Register *fp2Reg, + TR::Register *vec0Reg, TR::Register *vec1Reg, bool supportsLEArrayCopy, TR::RegisterDependencyConditions *conditions, TR::CodeGenerator *cg) + { + TR::LabelSymbol *forwardQuadWordArrayCopy_vsx = NULL; + TR::LabelSymbol *quadWordArrayCopy_vsx = NULL; + TR::LabelSymbol *quadWordAlignment_vsx = NULL; + TR::LabelSymbol *reverseQuadWordAlignment_vsx = NULL; + TR::LabelSymbol *quadWordAlignment_vsx_test8 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAlignment_vsx_test4 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAlignment_vsx_test2 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue32 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue16 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue8 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue4 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue2 = 
generateLabelSymbol(cg); + TR::LabelSymbol *quadWordAligned_vsx_residue1 = generateLabelSymbol(cg); + TR::LabelSymbol *quadWordLoop_vsx = generateLabelSymbol(cg); + TR::LabelSymbol *reverseQuadWordAlignment_vsx_test8 = NULL; + TR::LabelSymbol *reverseQuadWordAlignment_vsx_test4 = NULL; + TR::LabelSymbol *reverseQuadWordAlignment_vsx_test2 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue32 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue16 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue8 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue4 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue2 = NULL; + TR::LabelSymbol *reverseQuadWordAligned_vsx_residue1 = NULL; + TR::LabelSymbol *reverseQuadWordLoop_vsx = NULL; + TR::LabelSymbol *doneLabel = generateLabelSymbol(cg); + + generateMemInstruction(cg, TR::InstOpCode::dcbt, node, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 16, cg)); + generateMemInstruction(cg, TR::InstOpCode::dcbtst, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 16, cg)); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::ble, node, doneLabel, cndRegister); + + if (!node->isForwardArrayCopy()) + { + reverseQuadWordAlignment_vsx = generateLabelSymbol(cg); + generateTrg1Src2Instruction(cg, TR::InstOpCode::subf, node, tmp4Reg, srcAddrReg, dstAddrReg); + generateTrg1Src2Instruction(cg, TR::Compiler->target.is64Bit() ? TR::InstOpCode::cmpl8 : TR::InstOpCode::cmpl4, node, cndRegister, tmp4Reg, lengthReg); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAlignment_vsx, cndRegister); + } + + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue8, cndRegister); + + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 7); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, 1, node, quadWordAlignment_vsx_test8, cndRegister); + + //check 2 aligned + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, quadWordAlignment_vsx_test2, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 1, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 1, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -1); + + //check 4 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAlignment_vsx_test2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, quadWordAlignment_vsx_test4, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lhz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 2, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, new (cg->trHeapMemory()) 
TR::MemoryReference(dstAddrReg, 0, 2, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -2); + + //check 8 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAlignment_vsx_test4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, quadWordAlignment_vsx_test8, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfs, node, fp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfs, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), fp1Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), tmp2Reg); + } + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -4); + + //check 16 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAlignment_vsx_test8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, quadWordAligned_vsx, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 8, cg), fp2Reg); + } + else if (TR::Compiler->target.is64Bit()) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::ld, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 8, cg), tmp2Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 4, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), tmp1Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 4, 4, cg), tmp2Reg); + } + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -8); + + //16 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx); + generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tmp2Reg, 16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, 
node, tmp4Reg, lengthReg, 6); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, lengthReg, lengthReg, 63); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, tmp4Reg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, quadWordAligned_vsx_residue32, cndRegister); + generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tmp4Reg); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordLoop_vsx); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(NULL, srcAddrReg, 16, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(NULL, dstAddrReg, 16, cg), vec0Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(NULL, srcAddrReg, 16, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(NULL, dstAddrReg, 16, cg), vec0Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, quadWordLoop_vsx, cndRegister); + + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndRegister); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx_residue32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 32); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue16, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(NULL, srcAddrReg, 16, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(NULL, dstAddrReg, 16, cg), vec0Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -32); + + generateLabelInstruction(cg, 
TR::InstOpCode::label, node, quadWordAligned_vsx_residue16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue8, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(NULL, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(NULL, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -16); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx_residue8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue4, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 8, cg), fp2Reg); + } + else if (TR::Compiler->target.is64Bit()) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::ld, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 8, cg), tmp2Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 4, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), tmp1Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 4, 4, cg), tmp2Reg); + } + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -8); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx_residue4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue2, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfs, node, fp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfs, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), fp1Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 4, cg), tmp2Reg); + } + 
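+   // Advance past the 4 bytes just copied, then fall through to the 2-byte residue check.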
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -4); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx_residue2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, quadWordAligned_vsx_residue1, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lhz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 2, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 2, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -2); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, quadWordAligned_vsx_residue1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, doneLabel, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, 0, 1, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, 0, 1, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -1); + if(!node->isForwardArrayCopy()) + generateLabelInstruction(cg, TR::InstOpCode::b, node, doneLabel); + + if(!node->isForwardArrayCopy()) + { + reverseQuadWordAlignment_vsx_test8 = generateLabelSymbol(cg); + reverseQuadWordAlignment_vsx_test4 = generateLabelSymbol(cg); + reverseQuadWordAlignment_vsx_test2 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue32 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue16 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue8 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue4 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue2 = generateLabelSymbol(cg); + reverseQuadWordAligned_vsx_residue1 = generateLabelSymbol(cg); + reverseQuadWordLoop_vsx = generateLabelSymbol(cg); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAlignment_vsx); + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, srcAddrReg, srcAddrReg, lengthReg); + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstAddrReg, lengthReg); + + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue8, cndRegister); + + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 7); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAlignment_vsx_test8, cndRegister); + + //check 2 aligned + 
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAlignment_vsx_test2, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -1, 1, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -1, 1, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -1); + + //check 4 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAlignment_vsx_test2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAlignment_vsx_test4, cndRegister); + generateTrg1MemInstruction(cg, TR::InstOpCode::lhz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -2, 2, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -2, 2, cg), tmp2Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -2); + + //check 8 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAlignment_vsx_test4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAlignment_vsx_test8, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfs, node, fp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfs, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), fp1Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), tmp2Reg); + } + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -4); + + //check 16 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAlignment_vsx_test8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tmp4Reg, srcAddrReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAligned_vsx, cndRegister); + if (supportsLEArrayCopy) + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 8, cg), fp2Reg); + } + else if (TR::Compiler->target.is64Bit()) + 
{ + generateTrg1MemInstruction(cg, TR::InstOpCode::ld, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 8, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 8, cg), tmp2Reg); + } + else + { + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 4, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), tmp1Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 4, cg), tmp2Reg); + } + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -8); + + //16 aligned + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx); + generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tmp2Reg, -16); + generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tmp1Reg, -32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, tmp4Reg, lengthReg, 6); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, lengthReg, lengthReg, 63); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, tmp4Reg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, reverseQuadWordAligned_vsx_residue32, cndRegister); + generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tmp4Reg); + generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordLoop_vsx); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec0Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -32); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg)); + generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, srcAddrReg, 16, cg)); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec0Reg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, dstAddrReg, 16, cg), vec1Reg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -32); + generateConditionalBranchInstruction(cg, 
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordLoop_vsx);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg));
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, srcAddrReg, 16, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec0Reg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, dstAddrReg, 16, cg), vec1Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -32);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -32);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg));
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, srcAddrReg, 16, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec0Reg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, dstAddrReg, 16, cg), vec1Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -32);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -32);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, reverseQuadWordLoop_vsx, cndRegister);
+
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 0);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndRegister);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue32);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 32);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue16, cndRegister);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec0Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg));
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, srcAddrReg, 16, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec0Reg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp1Reg, dstAddrReg, 16, cg), vec1Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -32);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -32);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -32);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue16);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 16);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue8, cndRegister);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lxvw4x, node, vec1Reg, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, srcAddrReg, 16, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvw4x, node, new (cg->trHeapMemory()) TR::MemoryReference(tmp2Reg, dstAddrReg, 16, cg), vec1Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -16);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -16);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -16);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue8);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 8);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue4, cndRegister);
+      if (supportsLEArrayCopy)
+         {
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lfd, node, fp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 8, cg));
+         generateMemSrc1Instruction(cg, TR::InstOpCode::stfd, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 8, cg), fp2Reg);
+         }
+      else if (TR::Compiler->target.is64Bit())
+         {
+         generateTrg1MemInstruction(cg, TR::InstOpCode::ld, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 8, cg));
+         generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 8, cg), tmp2Reg);
+         }
+      else
+         {
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg));
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -8, 4, cg));
+         generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), tmp1Reg);
+         generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -8, 4, cg), tmp2Reg);
+         }
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -8);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -8);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -8);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue4);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 4);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue2, cndRegister);
+      if (supportsLEArrayCopy)
+         {
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lfs, node, fp1Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg));
+         generateMemSrc1Instruction(cg, TR::InstOpCode::stfs, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), fp1Reg);
+         }
+      else
+         {
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -4, 4, cg));
+         generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -4, 4, cg), tmp2Reg);
+         }
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -4);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -4);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -4);
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue2);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 2);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, reverseQuadWordAligned_vsx_residue1, cndRegister);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lhz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -2, 2, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -2, 2, cg), tmp2Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -2);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -2);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -2);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, reverseQuadWordAligned_vsx_residue1);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndRegister, lengthReg, 1);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, doneLabel, cndRegister);
+      generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, tmp2Reg, new (cg->trHeapMemory()) TR::MemoryReference(srcAddrReg, -1, 1, cg));
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, new (cg->trHeapMemory()) TR::MemoryReference(dstAddrReg, -1, 1, cg), tmp2Reg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, srcAddrReg, srcAddrReg, -1);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, -1);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, lengthReg, lengthReg, -1);
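+      // All remaining residue sizes (8, 4, 2 and 1 bytes) have been handled; fall through to doneLabel.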
+      }
+   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);
+   return;
+   }
+
 extern TR::Register *inlineLongRotateLeft(TR::Node *node, TR::CodeGenerator *cg);
 extern TR::Register *inlineIntegerRotateLeft(TR::Node *node, TR::CodeGenerator *cg);
 extern TR::Register *inlineBigDecimalConstructor(TR::Node *node, TR::CodeGenerator *cg, bool isLong, bool exp);
@@ -12693,6 +13615,588 @@ TR::Register *J9::Power::TreeEvaluator::tabortEvaluator(TR::Node *node, TR::Code
    return NULL;
    }
+TR::Register *J9::Power::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::CodeGenerator *cg)
+   {
+#ifdef OMR_GC_CONCURRENT_SCAVENGER
+   /*
+    * This version of arraycopyEvaluator is designed to handle the special case where read barriers are
+    * needed for field loads. At the time of writing, read barriers are used for Concurrent Scavenge GC.
+    * If there are no read barriers, then the original implementation of arraycopyEvaluator can be used.
+    */
+   if (!TR::Compiler->om.shouldGenerateReadBarriersForFieldLoads() ||
+       !node->chkNoArrayStoreCheckArrayCopy() ||
+       !node->isReferenceArrayCopy() ||
+       debug("noArrayCopy")
+      )
+      {
+      return OMR::TreeEvaluatorConnector::arraycopyEvaluator(node, cg);
+      }
+
+   TR::Compilation *comp = cg->comp();
+   TR::Instruction *gcPoint;
+   TR::Node *srcObjNode, *dstObjNode, *srcAddrNode, *dstAddrNode, *lengthNode;
+   TR::Register *srcObjReg, *dstObjReg, *srcAddrReg, *dstAddrReg, *lengthReg;
+   bool stopUsingCopyReg1, stopUsingCopyReg2, stopUsingCopyReg3, stopUsingCopyReg4, stopUsingCopyReg5 = false;
+
+   srcObjNode = node->getChild(0);
+   dstObjNode = node->getChild(1);
+   srcAddrNode = node->getChild(2);
+   dstAddrNode = node->getChild(3);
+   lengthNode = node->getChild(4);
+
+   // These calls evaluate the nodes and give back registers that can be clobbered if needed.
+   stopUsingCopyReg1 = TR::TreeEvaluator::stopUsingCopyReg(srcObjNode, srcObjReg, cg);
+   stopUsingCopyReg2 = TR::TreeEvaluator::stopUsingCopyReg(dstObjNode, dstObjReg, cg);
+   stopUsingCopyReg3 = TR::TreeEvaluator::stopUsingCopyReg(srcAddrNode, srcAddrReg, cg);
+   stopUsingCopyReg4 = TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg);
+
+   lengthReg = cg->evaluate(lengthNode);
+   if (!cg->canClobberNodesRegister(lengthNode))
+      {
+      TR::Register *lenCopyReg = cg->allocateRegister();
+      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg);
+      lengthReg = lenCopyReg;
+      stopUsingCopyReg5 = true;
+      }
+
+   // Inline forward arrayCopy with constant length.
+   int64_t len = -1;
+   if (node->isForwardArrayCopy() && lengthNode->getOpCode().isLoadConst())
+      {
+      len = (lengthNode->getType().isInt32() ? lengthNode->getInt() : lengthNode->getLongInt());
+      if (len >= 0)
+         {
+         /*
+          * This path generates code to perform a runtime check on whether concurrent GC is done moving objects or not.
+          * If it isn't, a call to the referenceArrayCopy helper should be made.
+          * If it is, using the inlined array copy code path is okay.
+          */
+         int32_t groups;
+
+         static bool disableLEArrayCopyInline = (feGetEnv("TR_disableLEArrayCopyInline") != NULL);
+         TR_Processor processor = TR::Compiler->target.cpu.id();
+         bool supportsLEArrayCopyInline = (processor >= TR_PPCp8) &&
+                                          !disableLEArrayCopyInline &&
+                                          TR::Compiler->target.cpu.isLittleEndian() &&
+                                          TR::Compiler->target.cpu.hasFPU() &&
+                                          TR::Compiler->target.is64Bit();
+
+         // There are a minimum of 9 dependencies.
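+         // (condReg in cr0, metaReg, r3Reg in gr3, the five helper arguments in gr4-gr8, and tmp1Reg in gr11.)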
+         uint8_t numDeps = 9;
+
+         if (TR::Compiler->target.is64Bit())
+            {
+            groups = len >> 5;
+            }
+         else
+            {
+            groups = len >> 4;
+            }
+
+         TR::Register *condReg = cg->allocateRegister(TR_CCR);
+
+         // These registers are used when taking the inlineArrayCopy_ICF path.
+         TR::Register *tmp1Reg = cg->allocateRegister(TR_GPR);
+         TR::Register *tmp2Reg = NULL;
+         TR::Register *tmp3Reg = NULL;
+         TR::Register *tmp4Reg = NULL;
+         TR::Register *fp1Reg = NULL;
+         TR::Register *fp2Reg = NULL;
+         TR::Register *fp3Reg = NULL;
+         TR::Register *fp4Reg = NULL;
+
+         // These registers are used when taking the referenceArrayCopy helper call path.
+         TR::Register *r3Reg = cg->allocateRegister();
+         TR::Register *metaReg = cg->getMethodMetaDataRegister();
+
+         if (groups != 0)
+            {
+            numDeps += 3;
+            tmp2Reg = cg->allocateRegister(TR_GPR);
+            tmp3Reg = cg->allocateRegister(TR_GPR);
+            tmp4Reg = cg->allocateRegister(TR_GPR);
+            }
+
+         if (supportsLEArrayCopyInline)
+            {
+            numDeps += 4;
+            fp1Reg = cg->allocateRegister(TR_FPR);
+            fp2Reg = cg->allocateRegister(TR_FPR);
+            fp3Reg = cg->allocateRegister(TR_FPR);
+            fp4Reg = cg->allocateRegister(TR_FPR);
+            }
+
+         /*
+          * r3-r8 are used to pass parameters to the referenceArrayCopy helper.
+          * r11 is used for the tmp1Reg since r11 gets killed by the trampoline and values put into tmp1Reg are not needed after the trampoline.
+          */
+         TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory());
+         addDependency(deps, condReg, TR::RealRegister::cr0, TR_CCR, cg);
+         addDependency(deps, metaReg, TR::RealRegister::NoReg, TR_GPR, cg);
+
+         addDependency(deps, r3Reg, TR::RealRegister::gr3, TR_GPR, cg);
+         addDependency(deps, srcObjReg, TR::RealRegister::gr4, TR_GPR, cg);
+         addDependency(deps, dstObjReg, TR::RealRegister::gr5, TR_GPR, cg);
+         addDependency(deps, srcAddrReg, TR::RealRegister::gr6, TR_GPR, cg);
+         addDependency(deps, dstAddrReg, TR::RealRegister::gr7, TR_GPR, cg);
+         addDependency(deps, lengthReg, TR::RealRegister::gr8, TR_GPR, cg);
+
+         addDependency(deps, tmp1Reg, TR::RealRegister::gr11, TR_GPR, cg);
+
+         if (groups != 0)
+            {
+            addDependency(deps, tmp2Reg, TR::RealRegister::NoReg, TR_GPR, cg);
+            addDependency(deps, tmp3Reg, TR::RealRegister::NoReg, TR_GPR, cg);
+            addDependency(deps, tmp4Reg, TR::RealRegister::NoReg, TR_GPR, cg);
+            }
+
+         if (supportsLEArrayCopyInline)
+            {
+            addDependency(deps, fp1Reg, TR::RealRegister::NoReg, TR_FPR, cg);
+            addDependency(deps, fp2Reg, TR::RealRegister::NoReg, TR_FPR, cg);
+            addDependency(deps, fp3Reg, TR::RealRegister::NoReg, TR_FPR, cg);
+            addDependency(deps, fp4Reg, TR::RealRegister::NoReg, TR_FPR, cg);
+            }
+
+         TR::LabelSymbol *startLabel = generateLabelSymbol(cg);
+         TR::LabelSymbol *helperLabel = generateLabelSymbol(cg);
+         TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
+         startLabel->setStartInternalControlFlow();
+         endLabel->setEndInternalControlFlow();
+
+         generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
+
+         // Runtime check for concurrent scavenger.
+         generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, tmp1Reg,
+               new (cg->trHeapMemory()) TR::MemoryReference(metaReg, offsetof(J9VMThread, readBarrierRangeCheckTop), TR::Compiler->om.sizeofReferenceAddress(), cg));
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, condReg, tmp1Reg, 0);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, helperLabel, condReg);
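+
+         // readBarrierRangeCheckTop should be non-zero only while concurrent scavenge is still moving objects,
+         // so a non-zero value sends the copy to the referenceArrayCopy helper instead of the inlined path.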
+
+         // Generate assembly for inlined version of array copy.
+         inlineArrayCopy_ICF(node, len, srcAddrReg, dstAddrReg, cg, condReg,
+                             tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg,
+                             fp1Reg, fp2Reg, fp3Reg, fp4Reg);
+
+         generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
+
+         // Start of referenceArrayCopy helper path.
+         generateLabelInstruction(cg, TR::InstOpCode::label, node, helperLabel);
+
+         TR::PPCJNILinkage *jniLinkage = (TR::PPCJNILinkage*) cg->getLinkage(TR_J9JNILinkage);
+         const TR::PPCLinkageProperties &pp = jniLinkage->getProperties();
+
+         int32_t elementSize;
+         if (comp->useCompressedPointers())
+            elementSize = TR::Compiler->om.sizeofReferenceField();
+         else
+            elementSize = (int32_t) TR::Compiler->om.sizeofReferenceAddress();
+
+         // Sign extend non-64bit integers on Linux PPC64, as required by the ABI.
+         if (TR::Compiler->target.isLinux() && TR::Compiler->target.is64Bit())
+            {
+            generateTrg1Src1Instruction(cg, TR::InstOpCode::extsw, node, lengthReg, lengthReg);
+            }
+
+         // The C routine expects the length measured in slots.
+         generateShiftRightLogicalImmediate(cg, node, lengthReg, lengthReg, trailingZeroes(elementSize));
+
+         generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r3Reg, metaReg);
+
+         TR::RegisterDependencyConditions *helperDeps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 0, cg->trMemory());
+         TR::SymbolReference *helperSym = comp->getSymRefTab()->findOrCreateRuntimeHelper(TR_referenceArrayCopy, false, false, false);
+
+         gcPoint = generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptrj_t)helperSym->getMethodAddress(), helperDeps, helperSym);
+         gcPoint->PPCNeedsGCMap(pp.getPreservedRegisterMapForGC());
+
+         generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);
+
+         TR::TreeEvaluator::genWrtbarForArrayCopy(node, srcObjReg, dstObjReg, cg);
+
+         cg->decReferenceCount(srcObjNode);
+         cg->decReferenceCount(dstObjNode);
+         cg->decReferenceCount(srcAddrNode);
+         cg->decReferenceCount(dstAddrNode);
+         cg->decReferenceCount(lengthNode);
+
+         if (stopUsingCopyReg1)
+            cg->stopUsingRegister(srcObjReg);
+         if (stopUsingCopyReg2)
+            cg->stopUsingRegister(dstObjReg);
+         if (stopUsingCopyReg3)
+            cg->stopUsingRegister(srcAddrReg);
+         if (stopUsingCopyReg4)
+            cg->stopUsingRegister(dstAddrReg);
+         if (stopUsingCopyReg5)
+            cg->stopUsingRegister(lengthReg);
+
+         if (tmp1Reg)
+            cg->stopUsingRegister(tmp1Reg);
+         if (tmp2Reg)
+            cg->stopUsingRegister(tmp2Reg);
+         if (tmp3Reg)
+            cg->stopUsingRegister(tmp3Reg);
+         if (tmp4Reg)
+            cg->stopUsingRegister(tmp4Reg);
+         if (fp1Reg)
+            cg->stopUsingRegister(fp1Reg);
+         if (fp2Reg)
+            cg->stopUsingRegister(fp2Reg);
+         if (fp3Reg)
+            cg->stopUsingRegister(fp3Reg);
+         if (fp4Reg)
+            cg->stopUsingRegister(fp4Reg);
+
+         if (r3Reg)
+            cg->stopUsingRegister(r3Reg);
+         if (condReg)
+            cg->stopUsingRegister(condReg);
+
+         cg->machine()->setLinkRegisterKilled(true);
+         cg->setHasCall();
+
+         return NULL;
+         }
+      }
+
+   /*
+    * This path also generates code to perform a runtime check on whether concurrent GC is done moving objects or not.
+    * If it isn't done, once again a call to the referenceArrayCopy helper should be made.
+    * If it is done, using the assembly helpers code path is okay.
+    */
+
+   TR::Register *condReg = cg->allocateRegister(TR_CCR);
+
+   // These registers are used when taking the assembly helpers path.
+   TR::Register *tmp1Reg = cg->allocateRegister(TR_GPR);
+   TR::Register *tmp2Reg = cg->allocateRegister(TR_GPR);
+   TR::Register *tmp3Reg = cg->allocateRegister(TR_GPR);
+   TR::Register *tmp4Reg = cg->allocateRegister(TR_GPR);
+   TR::Register *fp1Reg = NULL;
+   TR::Register *fp2Reg = NULL;
+   TR::Register *vec0Reg = NULL;
+   TR::Register *vec1Reg = NULL;
+
+   // These registers are used when taking the referenceArrayCopy helper call path.
+   TR::Register *r3Reg = cg->allocateRegister();
+   TR::Register *r4Reg = cg->allocateRegister();
+   TR::Register *metaReg = cg->getMethodMetaDataRegister();
+
+   // This section calculates the number of dependencies needed by the assembly helpers path.
+   static bool disableVSXArrayCopy = (feGetEnv("TR_disableVSXArrayCopy") != NULL);
+   static bool disableLEArrayCopyHelper = (feGetEnv("TR_disableLEArrayCopyHelper") != NULL);
+   static bool disableVSXArrayCopyInlining = (feGetEnv("TR_enableVSXArrayCopyInlining") == NULL); // Disabled due to a performance regression.
+
+   TR_Processor processor = TR::Compiler->target.cpu.id();
+
+   bool supportsVSX = (processor >= TR_PPCp8) && !disableVSXArrayCopy && TR::Compiler->target.cpu.getPPCSupportsVSX();
+   bool supportsLEArrayCopy = !disableLEArrayCopyHelper && TR::Compiler->target.cpu.isLittleEndian() && TR::Compiler->target.cpu.hasFPU();
+
+#if defined(DEBUG) || defined(PROD_WITH_ASSUMES)
+   static bool verboseArrayCopy = (feGetEnv("TR_verboseArrayCopy") != NULL); // Check which helper is getting used.
+   if (verboseArrayCopy)
+      fprintf(stderr, "arraycopy [0x%p] isReferenceArrayCopy:[%d] isForwardArrayCopy:[%d] isHalfWordElementArrayCopy:[%d] isWordElementArrayCopy:[%d] %s @ %s\n",
+         node,
+         node->isReferenceArrayCopy(),
+         node->isForwardArrayCopy(),
+         node->isHalfWordElementArrayCopy(),
+         node->isWordElementArrayCopy(),
+         comp->signature(),
+         comp->getHotnessName(comp->getMethodHotness())
+         );
+#endif
+
+   /*
+    * The minimum number of dependencies used by the assembly helpers path is 8.
+    * The number of dependencies added by the referenceArrayCopy helper call path is 5.
+    */
+   int32_t numDeps = 8 + 5;
+
+   if (processor >= TR_PPCp8 && supportsVSX)
+      {
+      vec0Reg = cg->allocateRegister(TR_VRF);
+      vec1Reg = cg->allocateRegister(TR_VRF);
+      numDeps += 2;
+      if (TR::Compiler->target.is32Bit())
+         {
+         numDeps += 1;
+         }
+      if (supportsLEArrayCopy)
+         {
+         fp1Reg = cg->allocateSinglePrecisionRegister();
+         fp2Reg = cg->allocateSinglePrecisionRegister();
+         numDeps += 2;
+         if (disableVSXArrayCopyInlining)
+            {
+            numDeps += 2;
+            }
+         }
+      }
+   else if (TR::Compiler->target.is32Bit())
+      {
+      numDeps += 1;
+      if (TR::Compiler->target.cpu.hasFPU())
+         {
+         numDeps += 4;
+         }
+      }
+   else if (processor >= TR_PPCp6)
+      {
+      numDeps += 4;
+      }
+
+   TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory());
+
+   /*
+    * Build up the dependency conditions for the assembly helper path.
+    * Unfortunately, the two different paths have a conflict regarding which real register they want srcAddrReg, dstAddrReg and lengthReg in.
+    * Dependencies are set up to favour the fast assembly path. Register moves are used in the slow helper path to move the values to the
+    * real registers they are expected to be in.
+    */
+   addDependency(deps, condReg, TR::RealRegister::cr0, TR_CCR, cg);
+
+   addDependency(deps, lengthReg, TR::RealRegister::gr7, TR_GPR, cg);
+   addDependency(deps, srcAddrReg, TR::RealRegister::gr8, TR_GPR, cg);
+   addDependency(deps, dstAddrReg, TR::RealRegister::gr9, TR_GPR, cg);
+
+   addDependency(deps, tmp1Reg, TR::RealRegister::gr5, TR_GPR, cg);
+   addDependency(deps, tmp2Reg, TR::RealRegister::gr6, TR_GPR, cg);
+   addDependency(deps, tmp3Reg, TR::RealRegister::gr0, TR_GPR, cg);
+   addDependency(deps, tmp4Reg, TR::RealRegister::gr11, TR_GPR, cg); // Trampoline kills gr11.
+
+   if (processor >= TR_PPCp8 && supportsVSX)
+      {
+      addDependency(deps, vec0Reg, TR::RealRegister::vr0, TR_VRF, cg);
+      addDependency(deps, vec1Reg, TR::RealRegister::vr1, TR_VRF, cg);
+      if (TR::Compiler->target.is32Bit())
+         {
+         addDependency(deps, NULL, TR::RealRegister::gr10, TR_GPR, cg);
+         }
+      if (supportsLEArrayCopy)
+         {
+         addDependency(deps, fp1Reg, TR::RealRegister::fp8, TR_FPR, cg);
+         addDependency(deps, fp2Reg, TR::RealRegister::fp9, TR_FPR, cg);
+         if (disableVSXArrayCopyInlining)
+            {
+            addDependency(deps, NULL, TR::RealRegister::fp10, TR_FPR, cg);
+            addDependency(deps, NULL, TR::RealRegister::fp11, TR_FPR, cg);
+            }
+         }
+      }
+   else if (TR::Compiler->target.is32Bit())
+      {
+      addDependency(deps, NULL, TR::RealRegister::gr10, TR_GPR, cg);
+      if (TR::Compiler->target.cpu.hasFPU())
+         {
+         addDependency(deps, NULL, TR::RealRegister::fp8, TR_FPR, cg);
+         addDependency(deps, NULL, TR::RealRegister::fp9, TR_FPR, cg);
+         addDependency(deps, NULL, TR::RealRegister::fp10, TR_FPR, cg);
+         addDependency(deps, NULL, TR::RealRegister::fp11, TR_FPR, cg);
+         }
+      }
+   else if (processor >= TR_PPCp6)
+      {
+      // stfdp arrayCopy used
+      addDependency(deps, NULL, TR::RealRegister::fp8, TR_FPR, cg);
+      addDependency(deps, NULL, TR::RealRegister::fp9, TR_FPR, cg);
+      addDependency(deps, NULL, TR::RealRegister::fp10, TR_FPR, cg);
+      addDependency(deps, NULL, TR::RealRegister::fp11, TR_FPR, cg);
+      }
+
+   // Add dependencies for the referenceArrayCopy helper call path.
+   addDependency(deps, r3Reg, TR::RealRegister::gr3, TR_GPR, cg);
+   addDependency(deps, r4Reg, TR::RealRegister::gr4, TR_GPR, cg);
+
+   addDependency(deps, srcObjReg, TR::RealRegister::NoReg, TR_GPR, cg);
+   addDependency(deps, dstObjReg, TR::RealRegister::NoReg, TR_GPR, cg);
+   addDependency(deps, metaReg, TR::RealRegister::NoReg, TR_GPR, cg);
+
+   TR::LabelSymbol *startLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *helperLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
+   startLabel->setStartInternalControlFlow();
+   endLabel->setEndInternalControlFlow();
+
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
+
+   // Runtime check for concurrent scavenger.
+   generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, tmp1Reg,
+         new (cg->trHeapMemory()) TR::MemoryReference(metaReg, offsetof(J9VMThread, readBarrierRangeCheckTop), TR::Compiler->om.sizeofReferenceAddress(), cg));
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, condReg, tmp1Reg, 0);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, helperLabel, condReg);
+
+   // Start of assembly helper path.
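+   // From here the copy is either emitted inline (VSX) or done via a branch-and-link to one of the
+   // arraycopy assembly helpers, selected below by copy direction, element size and processor level.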
+   if (!disableVSXArrayCopyInlining && supportsVSX)
+      {
+      inlineVSXArrayCopy(node, srcAddrReg, dstAddrReg, lengthReg, condReg,
+                         tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg, fp1Reg, fp2Reg,
+                         vec0Reg, vec1Reg, supportsLEArrayCopy, deps, cg);
+      }
+   else
+      {
+      TR_RuntimeHelper helper;
+
+      if (node->isForwardArrayCopy())
+         {
+         if (processor >= TR_PPCp8 && supportsVSX)
+            {
+            helper = TR_PPCforwardQuadWordArrayCopy_vsx;
+            }
+         else if (node->isWordElementArrayCopy())
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPCforwardWordArrayCopy_dp;
+            else
+               helper = TR_PPCforwardWordArrayCopy;
+            }
+         else if (node->isHalfWordElementArrayCopy())
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPCforwardHalfWordArrayCopy_dp;
+            else
+               helper = TR_PPCforwardHalfWordArrayCopy;
+            }
+         else
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPCforwardArrayCopy_dp;
+            else
+               {
+               helper = TR_PPCforwardArrayCopy;
+               }
+            }
+         }
+      else // We are not sure it is forward, or we have to do backward.
+         {
+         if (processor >= TR_PPCp8 && supportsVSX)
+            {
+            helper = TR_PPCquadWordArrayCopy_vsx;
+            }
+         else if (node->isWordElementArrayCopy())
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPCwordArrayCopy_dp;
+            else
+               helper = TR_PPCwordArrayCopy;
+            }
+         else if (node->isHalfWordElementArrayCopy())
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPChalfWordArrayCopy_dp;
+            else
+               helper = TR_PPChalfWordArrayCopy;
+            }
+         else
+            {
+            if (processor >= TR_PPCp6)
+               helper = TR_PPCarrayCopy_dp;
+            else
+               {
+               helper = TR_PPCarrayCopy;
+               }
+            }
+         }
+      TR::TreeEvaluator::generateHelperBranchAndLinkInstruction(helper, node, deps, cg);
+      }
+
+   generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
+
+   // Start of referenceArrayCopy helper path.
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, helperLabel);
+
+   TR::PPCJNILinkage *jniLinkage = (TR::PPCJNILinkage*) cg->getLinkage(TR_J9JNILinkage);
+   const TR::PPCLinkageProperties &pp = jniLinkage->getProperties();
+
+   int32_t elementSize;
+   if (comp->useCompressedPointers())
+      elementSize = TR::Compiler->om.sizeofReferenceField();
+   else
+      elementSize = (int32_t) TR::Compiler->om.sizeofReferenceAddress();
+
+   // Sign extend non-64bit integers on Linux PPC64, as required by the ABI.
+   if (TR::Compiler->target.isLinux() && TR::Compiler->target.is64Bit())
+      {
+      generateTrg1Src1Instruction(cg, TR::InstOpCode::extsw, node, lengthReg, lengthReg);
+      }
+
+   // The C routine expects the length measured in slots.
+   generateShiftRightLogicalImmediate(cg, node, lengthReg, lengthReg, trailingZeroes(elementSize));
+
+   /*
+    * Parameters are set up here:
+    * r3 = vmThread
+    * r4 = srcObj
+    * r5 = dstObj
+    * r6 = srcAddr
+    * r7 = dstAddr
+    * r8 = length
+    *
+    * CAUTION: Virtual register names are based on their use during the non-helper path so they are misleading after this point.
+    * Due to register reuse, pay attention to copying order so that a register is not clobbered too early.
+    */
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r3Reg, metaReg);
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, r4Reg, srcObjReg);
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, tmp1Reg, dstObjReg);    // tmp1Reg is tied to r5.
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, tmp2Reg, srcAddrReg);   // tmp2Reg is tied to r6.
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, srcAddrReg, lengthReg); // srcAddrReg is tied to r8. Need to copy srcAddrReg first.
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, lengthReg, dstAddrReg); // lengthReg is tied to r7. Need to copy lengthReg first.
+
+   TR::RegisterDependencyConditions *helperDeps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 0, cg->trMemory());
+   TR::SymbolReference *helperSym = comp->getSymRefTab()->findOrCreateRuntimeHelper(TR_referenceArrayCopy, false, false, false);
+
+   gcPoint = generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptrj_t)helperSym->getMethodAddress(), helperDeps, helperSym);
+   gcPoint->PPCNeedsGCMap(pp.getPreservedRegisterMapForGC());
+
+   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);
+
+   TR::TreeEvaluator::genWrtbarForArrayCopy(node, srcObjReg, dstObjReg, cg);
+
+   cg->decReferenceCount(srcObjNode);
+   cg->decReferenceCount(dstObjNode);
+   cg->decReferenceCount(srcAddrNode);
+   cg->decReferenceCount(dstAddrNode);
+   cg->decReferenceCount(lengthNode);
+
+   if (stopUsingCopyReg1)
+      cg->stopUsingRegister(srcObjReg);
+   if (stopUsingCopyReg2)
+      cg->stopUsingRegister(dstObjReg);
+   if (stopUsingCopyReg3)
+      cg->stopUsingRegister(srcAddrReg);
+   if (stopUsingCopyReg4)
+      cg->stopUsingRegister(dstAddrReg);
+   if (stopUsingCopyReg5)
+      cg->stopUsingRegister(lengthReg);
+   if (tmp1Reg)
+      cg->stopUsingRegister(tmp1Reg);
+   if (tmp2Reg)
+      cg->stopUsingRegister(tmp2Reg);
+   if (tmp3Reg)
+      cg->stopUsingRegister(tmp3Reg);
+   if (tmp4Reg)
+      cg->stopUsingRegister(tmp4Reg);
+   if (fp1Reg)
+      cg->stopUsingRegister(fp1Reg);
+   if (fp2Reg)
+      cg->stopUsingRegister(fp2Reg);
+   if (vec0Reg)
+      cg->stopUsingRegister(vec0Reg);
+   if (vec1Reg)
+      cg->stopUsingRegister(vec1Reg);
+   if (r3Reg)
+      cg->stopUsingRegister(r3Reg);
+   if (r4Reg)
+      cg->stopUsingRegister(r4Reg);
+   if (condReg)
+      cg->stopUsingRegister(condReg);
+
+   cg->machine()->setLinkRegisterKilled(true);
+   cg->setHasCall();
+
+   return NULL;
+#else /* OMR_GC_CONCURRENT_SCAVENGER */
+   return OMR::TreeEvaluatorConnector::arraycopyEvaluator(node, cg);
+#endif /* OMR_GC_CONCURRENT_SCAVENGER */
+   }
 #else /* TR_TARGET_POWER */
 // the following is to force an export to keep ilib happy
diff --git a/runtime/compiler/p/codegen/J9TreeEvaluator.hpp b/runtime/compiler/p/codegen/J9TreeEvaluator.hpp
index 02bfacdabba..e1d56c9a9f2 100644
--- a/runtime/compiler/p/codegen/J9TreeEvaluator.hpp
+++ b/runtime/compiler/p/codegen/J9TreeEvaluator.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2000, 2018 IBM Corp. and others
+ * Copyright (c) 2000, 2019 IBM Corp. and others
  *
  * This program and the accompanying materials are made available under
  * the terms of the Eclipse Public License 2.0 which accompanies this
@@ -79,6 +79,9 @@ class OMR_EXTENSIBLE TreeEvaluator: public J9::TreeEvaluator
    static TR::Register *tstartEvaluator(TR::Node *node, TR::CodeGenerator *cg);
    static TR::Register *tfinishEvaluator(TR::Node *node, TR::CodeGenerator *cg);
    static TR::Register *tabortEvaluator(TR::Node *node, TR::CodeGenerator *cg);
+   static TR::Register *arraycopyEvaluator(TR::Node *node, TR::CodeGenerator *cg);
+   static TR::Register *irdbarEvaluator(TR::Node *node, TR::CodeGenerator *cg);
+   static TR::Register *ardbarEvaluator(TR::Node *node, TR::CodeGenerator *cg);
    static void restoreTOCRegister(TR::Node *node, TR::CodeGenerator *cg, TR::RegisterDependencyConditions *dependencies);
    static void buildArgsProcessFEDependencies(TR::Node *node, TR::CodeGenerator *cg, TR::RegisterDependencyConditions *dependencies);
diff --git a/runtime/compiler/runtime/Runtime.cpp b/runtime/compiler/runtime/Runtime.cpp
index b9d13306abf..ac4ab440b70 100644
--- a/runtime/compiler/runtime/Runtime.cpp
+++ b/runtime/compiler/runtime/Runtime.cpp
@@ -999,7 +999,7 @@ void initializeCodeRuntimeHelperTable(J9JITConfig *jitConfig, char isSMP)
    SET(TR_flushGPU, (void *)flushGPU, TR_Helper);
    SET(TR_callGPU, (void *)callGPU, TR_Helper);
-#if defined(TR_HOST_X86)
+#if defined(TR_HOST_X86) || defined(TR_HOST_POWER)
    SET(TR_referenceArrayCopy, (void *)jitReferenceArrayCopy, TR_Helper);
 #endif
@@ -1048,7 +1048,7 @@
    SET(TR_typeCheckArrayStore, (void *)jitTypeCheckArrayStoreWithNullCheck, TR_Helper);
 #endif
-#if defined (TR_HOST_X86)
+#if defined(TR_HOST_X86) || defined(TR_HOST_POWER)
    SET(TR_readBarrier, (void *)jitReadBarrier, TR_Helper);
 #endif
    SET(TR_writeBarrierStore, (void *)jitWriteBarrierStore, TR_Helper);
diff --git a/runtime/gc_glue_java/configure_includes/configure_aix_ppc.mk b/runtime/gc_glue_java/configure_includes/configure_aix_ppc.mk
index 2091120af6c..d7b42a5f7e4 100644
--- a/runtime/gc_glue_java/configure_includes/configure_aix_ppc.mk
+++ b/runtime/gc_glue_java/configure_includes/configure_aix_ppc.mk
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, 2018 IBM Corp. and others
+# Copyright (c) 2016, 2019 IBM Corp. and others
 #
 # This program and the accompanying materials are made available under
 # the terms of the Eclipse Public License 2.0 which accompanies this
@@ -33,6 +33,7 @@ ifeq (aix_ppc-64_cmprssptrs, $(SPEC))
 	--enable-OMR_ARCH_POWER \
 	--enable-OMR_ENV_DATA64 \
 	--enable-OMR_GC_COMPRESSED_POINTERS \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_INTERP_COMPRESSED_OBJECT_HEADER \
 	--enable-OMR_INTERP_SMALL_MONITOR_SLOT \
 	--enable-OMR_PORT_CAN_RESERVE_SPECIFIC_ADDRESS
@@ -73,6 +74,7 @@ ifeq (aix_ppc-64, $(SPEC))
 	--enable-OMRTHREAD_LIB_AIX \
 	--enable-OMR_ARCH_POWER \
 	--enable-OMR_ENV_DATA64 \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_PORT_CAN_RESERVE_SPECIFIC_ADDRESS
 endif
diff --git a/runtime/gc_glue_java/configure_includes/configure_linux_ppc.mk b/runtime/gc_glue_java/configure_includes/configure_linux_ppc.mk
index 5c8690d8ec6..ac310fe2e62 100644
--- a/runtime/gc_glue_java/configure_includes/configure_linux_ppc.mk
+++ b/runtime/gc_glue_java/configure_includes/configure_linux_ppc.mk
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, 2018 IBM Corp. and others
+# Copyright (c) 2016, 2019 IBM Corp. and others
 #
 # This program and the accompanying materials are made available under
 # the terms of the Eclipse Public License 2.0 which accompanies this
@@ -34,6 +34,7 @@ ifeq (linux_ppc-64_cmprssptrs_le, $(SPEC))
 	--enable-OMR_ENV_DATA64 \
 	--enable-OMR_ENV_LITTLE_ENDIAN \
 	--enable-OMR_GC_COMPRESSED_POINTERS \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_GC_IDLE_HEAP_MANAGER \
 	--enable-OMR_INTERP_COMPRESSED_OBJECT_HEADER \
 	--enable-OMR_INTERP_SMALL_MONITOR_SLOT \
@@ -49,6 +50,7 @@ ifeq (linux_ppc-64_cmprssptrs_le_gcc, $(SPEC))
 	--enable-OMR_ENV_GCC \
 	--enable-OMR_ENV_LITTLE_ENDIAN \
 	--enable-OMR_GC_COMPRESSED_POINTERS \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_GC_IDLE_HEAP_MANAGER \
 	--enable-OMR_INTERP_COMPRESSED_OBJECT_HEADER \
 	--enable-OMR_INTERP_SMALL_MONITOR_SLOT \
@@ -127,6 +129,7 @@ ifeq (linux_ppc-64_le, $(SPEC))
 	--enable-OMR_ARCH_POWER \
 	--enable-OMR_ENV_DATA64 \
 	--enable-OMR_ENV_LITTLE_ENDIAN \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_GC_IDLE_HEAP_MANAGER \
 	--enable-OMR_PORT_CAN_RESERVE_SPECIFIC_ADDRESS \
 	--enable-OMR_PORT_NUMA_SUPPORT
@@ -139,6 +142,7 @@ ifeq (linux_ppc-64_le_gcc, $(SPEC))
 	--enable-OMR_ENV_DATA64 \
 	--enable-OMR_ENV_GCC \
 	--enable-OMR_ENV_LITTLE_ENDIAN \
+	--enable-OMR_GC_CONCURRENT_SCAVENGER \
 	--enable-OMR_GC_IDLE_HEAP_MANAGER \
 	--enable-OMR_PORT_CAN_RESERVE_SPECIFIC_ADDRESS \
 	--enable-OMR_PORT_NUMA_SUPPORT
diff --git a/runtime/gc_modron_startup/mminit.cpp b/runtime/gc_modron_startup/mminit.cpp
index ddc178eb3c7..3ecb889e578 100644
--- a/runtime/gc_modron_startup/mminit.cpp
+++ b/runtime/gc_modron_startup/mminit.cpp
@@ -2952,9 +2952,9 @@ gcInitializeDefaults(J9JavaVM* vm)
 	if (gc_policy_gencon == extensions->configurationOptions._gcPolicy) {
 		/* after we parsed cmd line options, check if we can obey the request to run CS (valid for Gencon only) */
 		if (extensions->concurrentScavengerForced) {
-#if defined(J9VM_ARCH_X86)
+#if defined(J9VM_ARCH_X86) || defined(J9VM_ARCH_POWER)
 			extensions->softwareRangeCheckReadBarrier = true;
-#endif /* J9VM_ARCH_X86 */
+#endif /* J9VM_ARCH_X86 || J9VM_ARCH_POWER */
 			if (LOADED == (FIND_DLL_TABLE_ENTRY(J9_JIT_DLL_NAME)->loadFlags & LOADED)) {
 				/* If running jitted, it must be on supported h/w */
 				J9ProcessorDesc processorDesc;
diff --git a/runtime/jilgen/jilconsts.c b/runtime/jilgen/jilconsts.c
index 88e040a1288..8343bd1dd44 100644
--- a/runtime/jilgen/jilconsts.c
+++ b/runtime/jilgen/jilconsts.c
@@ -480,6 +480,7 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd)
 		writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_compressedPointersShift", offsetof(J9JavaVM, compressedPointersShift)) |
 #endif /* J9VM_GC_COMPRESSED_POINTERS */
 		writeConstant(OMRPORTLIB, fd, "J9TR_J9MemoryManagerFunctions_J9ReadBarrier", offsetof(J9MemoryManagerFunctions, J9ReadBarrier)) |
+		writeConstant(OMRPORTLIB, fd, "J9TR_J9MemoryManagerFunctions_referenceArrayCopy", offsetof(J9MemoryManagerFunctions, referenceArrayCopy)) |
 		/* J9VMEntryLocalStorage */
 		writeConstant(OMRPORTLIB, fd, "J9TR_ELS_jitGlobalStorageBase", offsetof(J9VMEntryLocalStorage, jitGlobalStorageBase)) |
 		writeConstant(OMRPORTLIB, fd, "J9TR_ELS_jitFPRegisterStorageBase", offsetof(J9VMEntryLocalStorage, jitFPRegisterStorageBase)) |