blob: 18c92f3c72a20a65958aba25df3e88f7f363803e [file] [log] [blame]
{
"apiVersion": "kubeflow.org/v1",
"kind": "PyTorchJob",
"metadata": {
"creationTimestamp": "2022-07-24T17:54:23.000+08:00",
"generation": 1,
"labels": {
"submarine-experiment-name": "pytorch-dist-mnist"
},
"name": "experiment-1658656463509-0001",
"namespace": "default",
"resourceVersion": "26841",
"uid": "b95d6769-26ff-469c-a346-d5399f543f39"
},
"spec": {
"pytorchReplicaSpecs": {
"Master": {
"replicas": 1,
"template": {
"metadata": {
"annotations": {
"sidecar.istio.io/inject": "false"
}
},
"spec": {
"containers": [
{
"command": [
"python",
"/var/mnist.py",
"--backend",
"gloo"
],
"env": [
{
"name": "ENV_1",
"value": "ENV1"
}
],
"image": "apache/submarine:pytorch-dist-mnist-1.0",
"name": "pytorch",
"resources": {
"limits": {
"cpu": "2",
"memory": "4096M"
},
"requests": {
"cpu": "2",
"memory": "2048M"
}
},
"volumeMounts": [
{
"mountPath": "/logs",
"name": "volume",
"subPath": "submarine-tensorboard/pytorch-dist-mnist"
}
]
}
],
"volumes": [
{
"name": "volume",
"persistentVolumeClaim": {
"claimName": "submarine-tensorboard-pvc"
}
}
]
}
},
"restartPolicy": "OnFailure"
},
"Worker": {
"replicas": 2,
"template": {
"metadata": {
"annotations": {
"sidecar.istio.io/inject": "false"
}
},
"spec": {
"containers": [
{
"command": [
"python",
"/var/mnist.py",
"--backend",
"gloo"
],
"env": [
{
"name": "ENV_1",
"value": "ENV1"
}
],
"image": "apache/submarine:pytorch-dist-mnist-1.0",
"name": "pytorch",
"resources": {
"limits": {
"cpu": "1",
"memory": "2048M"
},
"requests": {
"cpu": "1",
"memory": "1024M"
}
},
"volumeMounts": [
{
"mountPath": "/logs",
"name": "volume",
"subPath": "submarine-tensorboard/pytorch-dist-mnist"
}
]
}
],
"volumes": [
{
"name": "volume",
"persistentVolumeClaim": {
"claimName": "submarine-tensorboard-pvc"
}
}
]
}
},
"restartPolicy": "OnFailure"
}
},
"backoffLimit": 3
},
"status": {
"conditions": [
{
"lastTransitionTime": "2022-07-24T11:23:25Z",
"lastUpdateTime": "2022-07-24T11:23:25Z",
"message": "PyTorchJob experiment-1658656463509-0001 is created.",
"reason": "PyTorchJobCreated",
"status": "True",
"type": "Created"
},
{
"lastTransitionTime": "2022-07-24T11:23:26Z",
"lastUpdateTime": "2022-07-24T11:23:26Z",
"message": "PyTorchJob experiment-1658656463509-0001 is running.",
"reason": "PyTorchJobRunning",
"status": "True",
"type": "Running"
}
],
"replicaStatuses": {
"Master": {
"active": 1
},
"Worker": {
"active": 1
}
}
}
}