Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Get log for each attempt. (#5106)
Browse files Browse the repository at this point in the history
Change the log API from `/jobs/{job-name}/pods/{podUid}/logs` to
 `/jobs/{job-name}/attempts/{jobAttemptId}/taskRoles/{taskRoleName}/taskIndex/{taskIndex}/attempts/{taskAttemptId}/logs`
 
Previously, getting the log for each task attempt required querying the DB four times:
1. get job-level info, 2. get job-attempt-level info, 3. get task-level info, 4. get task-attempt-level info.

After this change, only the task-level info needs to be queried.
  • Loading branch information
Binyang2014 authored Nov 23, 2020
1 parent a604060 commit 2fb370a
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 72 deletions.
35 changes: 19 additions & 16 deletions src/rest-server/docs/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ info:
Version 2.1.1: add get event api
Version 2.2.0: add get task status api; add jobAttempId filter to job status api and extend job detail schema
Version 2.2.1: a user can add/delete tags to/from his/her own jobs
version 2.2.2: add get pod logs api
version 2.2.2: add get task logs api
license:
name: MIT License
url: "https://github.com/microsoft/pai/blob/master/LICENSE"
Expand Down Expand Up @@ -1741,7 +1741,7 @@ paths:
ssh: 37508
http: 24661
containerGpus: null
containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs
containerLog: /api/v2/jobs/admin~admin_444da84f/attempts/0/taskRoles/taskrole/taskIndex/0/attempts/2/logs
containerExitCode: -220
containerExitSpec:
code: -220
Expand All @@ -1768,7 +1768,7 @@ paths:
ssh: 37508
http: 24661
containerGpus: null
containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs
containerLog: /api/v2/jobs/admin~admin_444da84f/attempts/0/taskRoles/taskrole/taskIndex/0/attempts/2/logs
containerExitCode: -220
containerExitSpec:
code: -220
Expand All @@ -1787,19 +1787,22 @@ paths:
$ref: "#/components/responses/NoTaskError"
"500":
$ref: "#/components/responses/UnknownError"
"/api/v2/jobs/{user}~{job}/pods/{podUid}/logs":
"/api/v2/jobs/{user}~{job}/attempts/{jobAttemptId}/taskRoles/{taskRoleName}/taskIndex/{taskIndex}/attempts/{taskAttemptId}/logs":
get:
tags:
- job
summary: Get job pod log list.
description: Get job pod log list.
operationId: getPodLogs
summary: Get task log list.
description: Get task log list.
operationId: getTaskLogs
security:
- bearerAuth: []
parameters:
- $ref: "#/components/parameters/user"
- $ref: "#/components/parameters/job"
- $ref: "#/components/parameters/podUid"
- $ref: "#/components/parameters/jobAttemptId"
- $ref: "#/components/parameters/taskRoleName"
- $ref: "#/components/parameters/taskIndex"
- $ref: "#/components/parameters/taskAttemptId"
- name: tailMode
in: query
description: getting log content via tail mode. Could be "true" or "false"
Expand All @@ -1821,7 +1824,7 @@ paths:
- name: stdout
uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.stdout?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token"
"404":
$ref: "#/components/responses/NoPodLogsError"
$ref: "#/components/responses/NoTaskLogError"
"500":
$ref: "#/components/responses/UnknownError"
/api/v2/kubernetes/nodes:
Expand Down Expand Up @@ -1957,10 +1960,10 @@ components:
required: true
schema:
type: string
podUid:
name: podUid
taskAttemptId:
name: taskAttemptId
in: path
description: job pod uid
description: task attempt id
required: true
schema:
type: string
Expand Down Expand Up @@ -3287,17 +3290,17 @@ components:
value:
code: NoJobSshInfoError
message: "SSH info of job {job} is not found."
NoPodLogsError:
description: NoPodLogsError
NoTaskLogError:
description: NoTaskLogError
content:
application/json:
schema:
$ref: "#/components/schemas/Response"
examples:
NoJobSshInfoError:
value:
code: NoPodLogsError
message: "Logs for pod {podUid} is not found."
code: NoTaskLogError
message: "Log of task is not found."
ConflictUserError:
description: ConflictUserError
content:
Expand Down
7 changes: 5 additions & 2 deletions src/rest-server/src/controllers/v2/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -293,13 +293,16 @@ const getLogs = asyncHandler(async (req, res) => {
try {
const data = await log.getLogListFromLogManager(
req.params.frameworkName,
req.params.podUid,
req.params.jobAttemptId,
req.params.taskRoleName,
req.params.taskIndex,
req.params.taskAttemptId,
req.query['tail-mode'],
);
res.json(data);
} catch (error) {
logger.error(`Got error when retrieving log list, error: ${error}`);
throw error.code === 'NoPodLogsError'
throw error.code === 'NoTaskLogErr'
? error
: createError(
'Internal Server Error',
Expand Down
15 changes: 12 additions & 3 deletions src/rest-server/src/models/v2/job/k8s.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,13 @@ const convertFrameworkSummary = (framework) => {
};
};

const convertTaskDetail = async (taskStatus, ports, frameworkName) => {
const convertTaskDetail = async (
taskName,
taskStatus,
jobAttemptId,
ports,
frameworkName,
) => {
// get containerPorts
const containerPorts = getContainerPorts(
ports,
Expand All @@ -91,6 +97,7 @@ const convertTaskDetail = async (taskStatus, ports, frameworkName) => {
const completionStatus = taskStatus.attemptStatus.completionStatus;
const diagnostics = completionStatus ? completionStatus.diagnostics : null;
const exitDiagnostics = generateExitDiagnostics(diagnostics);
const taskAttemptId = taskStatus.attemptStatus.id;
return {
taskIndex: taskStatus.index,
taskUid: taskStatus.instanceUID,
Expand All @@ -103,7 +110,7 @@ const convertTaskDetail = async (taskStatus, ports, frameworkName) => {
containerNodeName: taskStatus.attemptStatus.podNodeName,
containerPorts,
containerGpus,
containerLog: `/api/v2/jobs/${frameworkName}/pods/${taskStatus.attemptStatus.podUID}/logs`,
containerLog: `/api/v2/jobs/${frameworkName}/attempts/${jobAttemptId}/taskRoles/${taskName}/taskIndex/${taskStatus.index}/attempts/${taskAttemptId}/logs`,
containerExitCode: completionStatus ? completionStatus.code : null,
containerExitSpec: completionStatus
? generateExitSpec(completionStatus.code)
Expand All @@ -118,7 +125,7 @@ const convertTaskDetail = async (taskStatus, ports, frameworkName) => {
new Date(taskStatus.runTime || taskStatus.completionTime).getTime() ||
null,
completedTime: new Date(taskStatus.completionTime).getTime() || null,
attemptId: taskStatus.attemptStatus.id,
attemptId: taskAttemptId,
attemptState: convertAttemptState(
taskStatus.state || null,
completionStatus ? completionStatus.code : null,
Expand Down Expand Up @@ -286,7 +293,9 @@ const convertFrameworkDetail = async (
taskRoleStatus.taskStatuses.map(
async (status) =>
await convertTaskDetail(
taskRoleStatus.name,
status,
specifiedAttemptStatus.id,
ports[taskRoleStatus.name],
`${userName}~${jobName}`,
),
Expand Down
47 changes: 27 additions & 20 deletions src/rest-server/src/models/v2/job/log.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

const axios = require('axios');
const job = require('./k8s');
const logger = require('@pai/config/logger');
const task = require('@pai/models/v2/task');
const createError = require('@pai/utils/error');
const { encodeName } = require('@pai/models/v2/utils/name');

Expand All @@ -36,38 +36,45 @@ const loginLogManager = async (nodeIp, username, password) => {
});
};

const getLogListFromLogManager = async (frameworkName, podUid, tailMode) => {
const getLogListFromLogManager = async (
frameworkName,
jobAttemptId,
taskRoleName,
taskIndex,
taskAttemptId,
tailMode,
) => {
const adminName = process.env.LOG_MANAGER_ADMIN_NAME;
const adminPassword = process.env.LOG_MANAGER_ADMIN_PASSWORD;

const jobDetail = await job.get(frameworkName);
const noPodLogsErr = createError(
const taskDetail = await task.get(
frameworkName,
Number(jobAttemptId),
taskRoleName,
Number(taskIndex),
);
const NoTaskLogErr = createError(
'Not Found',
'NoPodLogsError',
`Logs for pod ${podUid} is not found.`,
'NoTaskLogError',
`Log of task is not found.`,
);
let nodeIp;
let taskRoleName;
for (const [key, taskRole] of Object.entries(jobDetail.taskRoles)) {
const status = taskRole.taskStatuses.find(
(status) => status.containerId === podUid,
);
if (!status) {
logger.error(`Failed to find pod which has pod uid ${podUid}`);
throw noPodLogsErr;
}
nodeIp = status.containerIp;
taskRoleName = key;
const taskStatus = taskDetail.data.attempts[Number(taskAttemptId)];
if (!taskStatus) {
logger.error(`Failed to find task to retrive log`);
throw NoTaskLogErr;
}

const nodeIp = taskStatus.containerIp;
const podUid = taskStatus.containerId;

let res = await loginLogManager(nodeIp, adminName, adminPassword);
const token = res.data.token;

const prefix = constrcutLogManagerPrefix(nodeIp);
try {
const params = {
token: token,
username: jobDetail.jobStatus.username,
username: taskDetail.data.username,
taskrole: taskRoleName,
};
params['framework-name'] = encodeName(frameworkName);
Expand All @@ -77,7 +84,7 @@ const getLogListFromLogManager = async (frameworkName, podUid, tailMode) => {
});
} catch (err) {
if (err.response && err.response.status === 404) {
throw noPodLogsErr;
throw NoTaskLogErr;
}
throw err;
}
Expand Down
2 changes: 1 addition & 1 deletion src/rest-server/src/models/v2/task.js
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ const get = async (frameworkName, jobAttemptIndex, taskRoleName, taskIndex) => {
);
if (taskRoleStatus) {
taskStatus = taskRoleStatus.taskStatuses.find(
(taskRoleStatus) => taskRoleStatus.index === taskIndex,
(taskStatus) => taskStatus.index === taskIndex,
);
}
if (taskStatus === undefined) {
Expand Down
47 changes: 23 additions & 24 deletions src/rest-server/src/models/v2/utils/frameworkConverter.js
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,6 @@ const convertToJobAttempt = async (framework) => {
framework.metadata.annotations,
);
const frameworkName = framework.metadata.name;
const logPathInfix = framework.metadata.annotations.logPathInfix
? framework.metadata.annotations.logPathInfix
: jobName;
const uid = framework.metadata.uid;
const userName = framework.metadata.labels
? framework.metadata.labels.userName
Expand Down Expand Up @@ -382,9 +379,10 @@ const convertToJobAttempt = async (framework) => {
taskRoleStatus.taskStatuses.map(
async (status) =>
await convertTaskDetail(
taskRoleStatus.name,
status,
userName,
logPathInfix,
attemptIndex,
`${userName}~${jobName}`,
taskRoleStatus.name,
true,
),
Expand Down Expand Up @@ -422,10 +420,10 @@ const convertToJobAttempt = async (framework) => {
};

const convertTaskDetail = async (
taskStatus,
userName,
logPathInfix,
taskRoleName,
taskStatus,
jobAttemptId,
frameworkName,
withoutGetPod,
) => {
// get container gpus
Expand All @@ -434,6 +432,7 @@ const convertTaskDetail = async (
taskStatus.attemptStatus.podName,
);
const completionStatus = taskStatus.attemptStatus.completionStatus;
const taskAttemptId = taskStatus.attemptStatus.id;
return {
taskIndex: taskStatus.index,
taskState: convertState(
Expand All @@ -443,16 +442,17 @@ const convertTaskDetail = async (
containerId: taskStatus.attemptStatus.podUID,
containerIp: taskStatus.attemptStatus.podHostIP,
containerGpus,
containerLog: `http://${taskStatus.attemptStatus.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/tail/${userName}/${logPathInfix}/${taskRoleName}/${taskStatus.attemptStatus.podUID}/`,
containerLog: `/api/v2/jobs/${frameworkName}/attempts/${jobAttemptId}/taskRoles/${taskRoleName}/taskIndex/${taskStatus.index}/attempts/${taskAttemptId}/logs`,
containerExitCode: completionStatus ? completionStatus.code : null,
};
};

const convertTaskAttempt = async (
logPathInfix, // job level info
userName,
frameworkName,
jobAttemptId,
taskRoleName,
taskIndex,
ports,
taskRoleName, // task role level info
attemptState, // attempt level info
attemptStatus,
) => {
Expand Down Expand Up @@ -491,7 +491,7 @@ const convertTaskAttempt = async (
// Job level info
containerPorts,
containerGpus,
containerLog: `http://${attemptStatus.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/tail/${userName}/${logPathInfix}/${taskRoleName}/${attemptStatus.podUID}/`,
containerLog: `/api/v2/jobs/${frameworkName}/attempts/${jobAttemptId}/taskRoles/${taskRoleName}/taskIndex/${taskIndex}/attempts/${attemptStatus.id}/logs`,
containerExitCode: completionStatus ? completionStatus.code : null,
containerExitSpec: completionStatus
? generateExitSpec(completionStatus.code)
Expand Down Expand Up @@ -520,12 +520,13 @@ const convertToTaskDetail = async (
const completionStatus = lastTaskAttemptStatus.completionStatus;
const userName = attemptFramework.metadata.labels.userName;
const jobName = attemptFramework.metadata.annotations.jobName;
const jobAttemptId = attemptFramework.status.attemptStatus.id;

const taskDetail = {
// job level information
username: userName,
jobName: jobName,
jobAttemptId: attemptFramework.status.attemptStatus.id,
jobAttemptId: jobAttemptId,
// task role level information
taskRoleName: taskRoleName,
// task level information
Expand All @@ -546,10 +547,6 @@ const convertToTaskDetail = async (
attempts: [],
};

const logPathInfix = attemptFramework.metadata.annotations.logPathInfix
? attemptFramework.metadata.annotations.logPathInfix
: jobName;

const ports = attemptFramework.spec.taskRoles.find(
(taskRoleSpec) => taskRoleSpec.name === taskRoleName,
).task.pod.metadata.annotations['rest-server/port-scheduling-spec'];
Expand All @@ -558,10 +555,11 @@ const convertToTaskDetail = async (
// last task attempt
taskDetail.attempts.push(
await convertTaskAttempt(
logPathInfix,
userName,
ports,
`${userName}~${jobName}`,
jobAttemptId,
taskRoleName,
taskStatus.index,
ports,
lastTaskAttemptState,
lastTaskAttemptStatus,
),
Expand All @@ -571,10 +569,11 @@ const convertToTaskDetail = async (
for (const taskHistory of taskHistories) {
taskDetail.attempts.push(
await convertTaskAttempt(
logPathInfix,
userName,
ports,
`${userName}~${jobName}`,
jobAttemptId,
taskRoleName,
taskStatus.index,
ports,
taskHistory.status.state,
taskHistory.status.attemptStatus,
),
Expand Down
Loading

0 comments on commit 2fb370a

Please sign in to comment.