Skip to content

Commit

Permalink
publish from 07ff8974 (gopub)
Browse files Browse the repository at this point in the history
  • Loading branch information
huiqiwa committed Feb 23, 2023
1 parent b649944 commit 0281ccc
Show file tree
Hide file tree
Showing 288 changed files with 2,383 additions and 952 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,10 @@ You may get the latest installers or binaries in [Releases](https://github.com/i
## Documentation
* Refer to the [XPU Manager Installation Guide](doc/Install_guide.md) for how to install/uninstall XPU Manager.
* Refer to the [XPU-SMI Installation Guide](doc/smi_install_guide.md) for how to install/uninstall XPU-SMI.
* Refer to the [XPU Manager CLI User Guide](doc/CLI_user_guide.md) to start using XPU Manager.
* Refer to the [XPU-SMI CLI User Guide](doc/smi_user_guide.md) to start using XPU-SMI.
* Refer to the [XPU Manager amcmcli User Guide](doc/amcmcli_user_guide.md) to start using XPU Manager amcmcli.
* Refer to the [XPU Manager CLI User Guide](doc/CLI_user_guide.md) to start to use XPU Manager.
* Refer to the [XPU-SMI CLI User Guide](doc/smi_user_guide.md) to start to use XPU-SMI.
* Refer to the [XPU Manager Windows CLI User Guide](doc/xpum_win_user_guide.md) to start to use XPU Manager Windows CLI.
* Refer to the [XPU Manager amcmcli User Guide](doc/amcmcli_user_guide.md) to start to use XPU Manager amcmcli.
* Refer to [DockerHub](https://hub.docker.com/r/intel/xpumanager) for a Docker container image that can be used as a Prometheus exporter in a Kubernetes environment.
* Refer to [Building XPU Manager Installer](BUILDING.md) to build XPU Manager installer packages.

Expand Down
24 changes: 0 additions & 24 deletions REST_README.md

This file was deleted.

2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.2
1.2.3
12 changes: 1 addition & 11 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,4 @@ cmake .. $ccache_opts $prefix_default $@
make -j4

echo "---------Create installation package-----------"
cpack

if [ -f ~/password.sys_dcm ]; then
PackageName=$(cat package_file_name)
CSUser="ccr\\sys_dcm"
CSPwd=$(cat ~/password.sys_dcm)
echo "SignFile:${PackageName}"
pushd "${WORK_DIR}"/install/tools/signfile
./SignFile -vv -u "${CSUser}" -p "${CSPwd}" "${WORK_DIR}"/build/${PackageName}
popd
fi
cpack
2 changes: 1 addition & 1 deletion builder/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
typing-extensions==4.1.1
urllib3==1.26.9
Werkzeug==2.0.3
Werkzeug==2.2.3
zipp==3.6.0
1 change: 1 addition & 0 deletions cli/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ include_directories(/usr/local/include)
include_directories(${CMAKE_CURRENT_LIST_DIR}/src)
include_directories(${CMAKE_CURRENT_LIST_DIR}/../core/include)
include_directories(${CMAKE_CURRENT_LIST_DIR}/../core/src/api)
include_directories(${CMAKE_CURRENT_LIST_DIR}/../core/src/amc)
include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/CLI11/include)
include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/json/include)
if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/test)
Expand Down
4 changes: 3 additions & 1 deletion cli/src/cli_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ int CLIWrapper::printResult(std::ostream &out) {
setenv("XPUM_METRICS", "0,4-38", 1);
else if (dump_comlet->dumpEUMetrics())
setenv("XPUM_METRICS", "0-31,36-38", 1);
else
else if(dump_comlet->dumpRASMetrics())
setenv("XPUM_METRICS", "0,4-31,36-38", 1);
else
setenv("XPUM_METRICS", "0,4-19,29-31,36-38", 1);
}
if (comlet->getCommand().compare("dump") == 0 && std::dynamic_pointer_cast<ComletDump>(comlet)->dumpIdlePowerOnly()) {
this->coreStub = std::make_shared<LibCoreStub>(false);
Expand Down
131 changes: 108 additions & 23 deletions cli/src/comlet_diagnostic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "core_stub.h"
#include "utility.h"
#include "exit_code.h"
#include <unordered_map>

namespace xpum::cli {

Expand Down Expand Up @@ -56,26 +57,51 @@ static CharTableConfig ComletConfigDiagnosticDevice(R"({
}]
})"_json);

// Table layout passed to CharTable when a diagnostic result is shown in
// SINGLE_TEST mode (see showDeviceDiagnostic).
// The JSON below turns the title row off, declares two columns titled "none",
// and defines two row templates:
//   - a "Device ID" row bound to the "device_id" field;
//   - a row instanced over "component_list[]" that shows "component_type" next
//     to the "result" and "message" fields, with sub-rows for each entry of
//     "process_list[]" (process_id / process_name) and "media_codec_list[]" (fps).
// NOTE(review): the exact rendering rules for "subrow"/"subs" live in CharTable,
// which is not visible here -- confirm there before changing keys.
static CharTableConfig ComletConfigSpecificDiagnosticDevice(R"({
"showTitleRow": false,
"columns": [{
"title": "none"
}, {
"title": "none"
}],
"rows": [{
"instance": "",
"cells": [
{ "rowTitle": "Device ID" },
"device_id"
]
}, {
"instance": "component_list[]",
"cells": [
{ "value": "component_type" }, [
{ "label": "Result", "value": "result" },
{ "label": "Message", "value": "message" },
{ "value": "process_list[]", "subrow": true, "subs": [
{ "label": " PID", "value": "process_id" },
{ "label": "Command", "value": "process_name" }
]},
{ "value": "media_codec_list[]", "subrow": true, "subs": [
{ "label": "", "value": "fps" }
]}
]]
}]
})"_json);

static CharTableConfig ComletConfigDiagnosticPreCheck(R"({
"showTitleRow": true,
"columns": [{
"title": "Component",
"size": 16
}, {
"title": "Status"
"title": "Details"
}],
"rows": [{
"instance": "",
"cells": [[
{ "rowTitle": "GPU" },
{ "rowTitle": "Driver" },
{ "rowTitle": "GPU Status" },
{ "rowTitle": "CPU Status" }
], [
{ "value": "gpu_basic_info" },
{ "value": "gpu_driver_info" },
{ "value": "gpu_status_info" },
{ "value": "cpu_status_info" }
"instance": "component_list[]",
"cells": [
{ "value": "type" }, [
{ "value": "error_details[]", "subrow": true, "subs": [
{ "value": "field_value" }
]}
]]
}]
})"_json);
Expand Down Expand Up @@ -109,6 +135,16 @@ void ComletDiagnostic::setupOptions() {
auto stressFlag = addFlag("-s,--stress", this->opts->stress, "Stress the GPU(s) for the specified time");
auto stressTimeOpt = addOption("--stresstime", this->opts->stressTime, "Stress time (in minutes)");
auto preCheckOpt = addFlag("--precheck", this->opts->preCheck, "Do the precheck on the GPU and GPU driver");
auto onlyGPUOpt = addFlag("--gpu", this->opts->onlyGPU, "Show the GPU status only");

auto singleTestId = addOption("--singletest", this->opts->singleTestId,
"Selectively run a particular test\n\
1. Computation\n\
2. Memory Error\n\
3. Memory Bandwidth\n\
4. Media Codec\n\
5. PCIe Bandwidth\n\
6. Power");

preCheckOpt->excludes(deviceIdOpt);
preCheckOpt->excludes(level);
Expand All @@ -117,18 +153,22 @@ void ComletDiagnostic::setupOptions() {
level->excludes(preCheckOpt);
level->excludes(stressFlag);
level->excludes(stressTimeOpt);
level->excludes(singleTestId);
singleTestId->excludes(level);

deviceIdOpt->excludes(preCheckOpt);
if (stressFlag == nullptr) {
deviceIdOpt->needs(level);
}
stressTimeOpt->needs(stressFlag);

onlyGPUOpt->needs(preCheckOpt);

#ifndef DAEMONLESS
preCheckOpt->excludes(groupIdOpt);
groupIdOpt->excludes(preCheckOpt);
groupIdOpt->excludes(stressFlag);
groupIdOpt->excludes(stressTimeOpt);
groupIdOpt->needs(level);
stressFlag->needs(stressTimeOpt);
#endif
}
Expand All @@ -152,6 +192,13 @@ std::unique_ptr<nlohmann::json> ComletDiagnostic::run() {
(*json)["errno"] = XPUM_CLI_ERROR_DIAGNOSTIC_INVALID_LEVEL;
return json;
}

if (this->opts->singleTestId != INT_MIN && (this->opts->singleTestId < 1 || this->opts->singleTestId > 6)) {
(*json)["error"] = "invalid single test";
(*json)["errno"] = XPUM_CLI_ERROR_DIAGNOSTIC_INVALID_SINGLE_TEST;
return json;
}

if (this->opts->level >= 1 && this->opts->level <= 3) {
if (this->opts->deviceIds[0] != "-1") {
int targetId = -1;
Expand All @@ -163,19 +210,48 @@ std::unique_ptr<nlohmann::json> ComletDiagnostic::run() {
return convertResult;
}
}
json = this->coreStub->runDiagnostics(targetId, this->opts->level, this->opts->rawComponentTypeStr);
json = this->coreStub->runDiagnostics(targetId, this->opts->level, -1, this->opts->rawJson);
return json;
}
#ifndef DAEMONLESS
else if (this->opts->groupId > 0 && this->opts->groupId != UINT_MAX) {
json = this->coreStub->runDiagnosticsByGroup(this->opts->groupId, this->opts->level, this->opts->rawComponentTypeStr);
json = this->coreStub->runDiagnosticsByGroup(this->opts->groupId, this->opts->level, -1, this->opts->rawJson);
return json;
}
#endif
}

std::unordered_map<int, int> testIdToType = {{1, XPUM_DIAG_PERFORMANCE_COMPUTATION},
{2, XPUM_DIAG_MEMORY_ERROR},
{3, XPUM_DIAG_PERFORMANCE_MEMORY_BANDWIDTH},
{4, XPUM_DIAG_MEDIA_CODEC},
{5, XPUM_DIAG_INTEGRATION_PCIE},
{6, XPUM_DIAG_PERFORMANCE_POWER}};

if (this->opts->singleTestId >= 1 && this->opts->singleTestId <= 6) {
if (this->opts->deviceIds[0] != "-1") {
int targetId = -1;
if (isNumber(this->opts->deviceIds[0])) {
targetId = std::stoi(this->opts->deviceIds[0]);
} else {
auto convertResult = this->coreStub->getDeivceIdByBDF(this->opts->deviceIds[0].c_str(), &targetId);
if (convertResult->contains("error")) {
return convertResult;
}
}
json = this->coreStub->runDiagnostics(targetId, -1, testIdToType[this->opts->singleTestId], this->opts->rawJson);
return json;
}
#ifndef DAEMONLESS
else if (this->opts->groupId > 0 && this->opts->groupId != UINT_MAX) {
json = this->coreStub->runDiagnosticsByGroup(this->opts->groupId, -1, testIdToType[this->opts->singleTestId], this->opts->rawJson);
return json;
}
#endif
}

if (this->opts->preCheck) {
json = this->coreStub->getPreCheckInfo();
json = this->coreStub->getPreCheckInfo(this->opts->onlyGPU, this->opts->rawJson);
return json;
}

Expand Down Expand Up @@ -204,10 +280,13 @@ std::unique_ptr<nlohmann::json> ComletDiagnostic::run() {
return json;
}

static void showDeviceDiagnostic(std::ostream &out, std::shared_ptr<nlohmann::json> json, bool precheck = false, const bool cont = false) {
if (precheck) {
static void showDeviceDiagnostic(std::ostream &out, std::shared_ptr<nlohmann::json> json, ShowMode mode, const bool cont = false) {
if (mode == PRE_CHECK) {
CharTable table(ComletConfigDiagnosticPreCheck, *json, cont);
table.show(out);
} else if (mode == SINGLE_TEST) {
CharTable table(ComletConfigSpecificDiagnosticDevice, *json, cont);
table.show(out);
} else {
CharTable table(ComletConfigDiagnosticDevice, *json, cont);
table.show(out);
Expand All @@ -232,9 +311,9 @@ static void showStreesedDevices(std::ostream &out, const std::vector<std::string
}

void ComletDiagnostic::getTableResult(std::ostream &out) {
this->opts->rawComponentTypeStr = false;
this->opts->rawJson = false;
auto res = run();
this->opts->rawComponentTypeStr = true;
this->opts->rawJson = true;
if (res->contains("error")) {
out << "Error: " << (*res)["error"].get<std::string>() << std::endl;
setExitCodeByJson(*res);
Expand All @@ -248,7 +327,10 @@ void ComletDiagnostic::getTableResult(std::ostream &out) {
auto devices = (*json)["device_list"].get<std::vector<nlohmann::json>>();
bool cont = false;
for (auto device : devices) {
showDeviceDiagnostic(out, std::make_shared<nlohmann::json>(device), false, cont);
if (this->opts->level >= 1 && this->opts->level <= 3)
showDeviceDiagnostic(out, std::make_shared<nlohmann::json>(device), LEVEL_TEST, cont);
else
showDeviceDiagnostic(out, std::make_shared<nlohmann::json>(device), SINGLE_TEST, cont);
cont = true;
}
return;
Expand Down Expand Up @@ -292,12 +374,15 @@ void ComletDiagnostic::getTableResult(std::ostream &out) {
}

if (isDeviceOperation()) {
showDeviceDiagnostic(out, json);
if (this->opts->level >= 1 && this->opts->level <= 3)
showDeviceDiagnostic(out, json, LEVEL_TEST);
else
showDeviceDiagnostic(out, json, SINGLE_TEST);
return;
}

if (this->opts->preCheck) {
showDeviceDiagnostic(out, json, true);
showDeviceDiagnostic(out, json, PRE_CHECK);
return;
}
}
Expand Down
10 changes: 9 additions & 1 deletion cli/src/comlet_diagnostic.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,20 @@ struct ComletDiagnosticOptions {
uint32_t groupId = UINT_MAX;
#endif
int level = INT_MIN;
bool rawComponentTypeStr = true;
int singleTestId = INT_MIN;
bool rawJson = true;
bool preCheck = false;
bool onlyGPU = false;
uint32_t stressTime = 0;
bool stress = false;
};

// Selects which table layout showDeviceDiagnostic() uses to render a
// diagnostic result.
enum ShowMode {
// Result of a level-based run (level 1..3); rendered with ComletConfigDiagnosticDevice.
LEVEL_TEST,
// Result of a single selected test; rendered with ComletConfigSpecificDiagnosticDevice.
SINGLE_TEST,
// Result of the precheck; rendered with ComletConfigDiagnosticPreCheck.
PRE_CHECK
};

class ComletDiagnostic : public ComletBase {
public:
ComletDiagnostic() : ComletBase("diag", "Run some test suites to diagnose GPU.") {
Expand Down
11 changes: 11 additions & 0 deletions cli/src/comlet_dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ bool ComletDump::dumpEUMetrics() {
return false;
}

// Returns true when at least one id in opts->metricsIdList is one of the
// XPUM_DUMP_RAS_ERROR_CAT_* dump types checked below; false otherwise
// (including when the list is empty).
bool ComletDump::dumpRASMetrics() {
    for (auto metricId : this->opts->metricsIdList) {
        // Ids between CAT_RESET and CAT_CACHE_ERRORS_UNCORRECTABLE (inclusive).
        const bool withinRasRange =
            metricId >= xpum_dump_type_t::XPUM_DUMP_RAS_ERROR_CAT_RESET &&
            metricId <= xpum_dump_type_t::XPUM_DUMP_RAS_ERROR_CAT_CACHE_ERRORS_UNCORRECTABLE;
        // The two non-compute categories are tested individually rather than
        // as part of the range above.
        const bool isNonComputeRas =
            metricId == xpum_dump_type_t::XPUM_DUMP_RAS_ERROR_CAT_NON_COMPUTE_ERRORS_CORRECTABLE ||
            metricId == xpum_dump_type_t::XPUM_DUMP_RAS_ERROR_CAT_NON_COMPUTE_ERRORS_UNCORRECTABLE;
        if (withinRasRange || isNonComputeRas) {
            return true;
        }
    }
    return false;
}

static std::string getFileValue(std::string file_name) {
std::ifstream ifs(file_name);
std::string content((std::istreambuf_iterator<char>(ifs)), (std::istreambuf_iterator<char>()));
Expand Down
2 changes: 2 additions & 0 deletions cli/src/comlet_dump.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class ComletDump : public ComletBase {

bool dumpEUMetrics();

bool dumpRASMetrics();

bool dumpIdlePowerOnly();

std::unique_ptr<nlohmann::json> combineTileAndDeviceLevel(nlohmann::json rawJson);
Expand Down
2 changes: 1 addition & 1 deletion cli/src/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
#define CLI_VERSION_GIT_COMMIT "@GIT_COMMIT@"
#define CLI_VERSION_IN_HELP "@PROJECT_VERSION_IN_HELP@"

#define XPUM_CONFIG_DIR "@CPACK_PACKAGING_INSTALL_PREFIX@/config/"
#define XPUM_CONFIG_DIR "@XPUM_CONFIG_DIR@/"
Loading

0 comments on commit 0281ccc

Please sign in to comment.