Skip to content

Commit 71869c5

Browse files
committed
add nodeagent.yaml
1 parent e46771d commit 71869c5

File tree

6 files changed

+195
-7
lines changed

6 files changed

+195
-7
lines changed

inferxlib/src/obj_mgr/cidrlock.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
use serde::{Deserialize, Serialize};
2+
3+
use crate::resource::NodeResources;
4+
5+
use crate::data_obj::*;
6+
7+
/// Spec type for the `cidrlock` object module.
///
/// The struct currently declares no fields; it derives serde
/// `Serialize`/`Deserialize` plus `Debug`, `Clone` and `Default`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
8+
pub struct CidrlockSpec {}

inferxlib/src/obj_mgr/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
pub mod cidrlock;
1516
pub mod func_mgr;
1617
pub mod funcsnapshot_mgr;
1718
pub mod namespace_mgr;

inferxlib/src/resource.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ pub const MAX_GPU_COUNT: usize = 8;
2525

2626
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)]
2727

28-
pub struct GPUType(String);
28+
pub struct GPUType(pub String);
2929

3030
impl Default for GPUType {
3131
fn default() -> Self {
@@ -38,6 +38,10 @@ impl GPUType {
3838
return Self("Any".to_string());
3939
}
4040

41+
/// Builds a `GPUType` whose inner string is the literal `"Unknown"`.
pub fn Unknown() -> Self {
42+
return Self("Unknown".to_string());
43+
}
44+
4145
pub fn CanAlloc(&self, req: &Self) -> bool {
4246
if &req.0 == "Any" {
4347
return true;
@@ -134,12 +138,10 @@ pub struct ResourceConfig {
134138
pub cpu: u64, // 1/1000 CPU cores
135139
#[serde(rename = "Mem", default)]
136140
pub memory: u64, // MB memory
137-
#[serde(rename = "GPUType", default)]
138-
pub gpuType: GPUType,
141+
#[serde(rename = "Mem_2MB", default)]
142+
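pub memory2MB: u64, // memory counted separately from `memory`, serialized as "Mem_2MB" (presumably 2MB hugepage memory, in MB — TODO confirm unit)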
139143
#[serde(rename = "GPUs", default)]
140144
pub gpus: GPUSet,
141-
#[serde(rename = "vRam", default)]
142-
pub vRam: u64, // MB vRam per GPU
143145

144146
#[serde(rename = "ContextOverhead")]
145147
pub contextOverhead: u64, // MB vRam per GPU

k8s/nodeagent.yaml

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,146 @@
11
apiVersion: apps/v1
22
kind: DaemonSet
33
metadata:
4-
name: nodeagent
4+
name: nodeagent-blob
5+
labels:
6+
app: nodeagent
7+
spec:
8+
selector:
9+
matchLabels:
10+
app: nodeagent
11+
template:
12+
metadata:
13+
labels:
14+
app: nodeagent
15+
spec:
16+
nodeSelector:
17+
inferx_nodeType: inferx_blob
18+
hostPID: true
19+
initContainers:
20+
- name: wait-for-dependencies
21+
image: busybox
22+
command:
23+
[
24+
"sh", "-c",
25+
"until nc -z etcd.default.svc.cluster.local 2379 && \
26+
nc -z keycloak.default.svc.cluster.local 8080; do \
27+
echo 'Waiting for dependencies...'; sleep 3; \
28+
done"
29+
]
30+
containers:
31+
- name: nodeagent
32+
image: inferx/inferx_one:v0.1.1
33+
securityContext:
34+
privileged: true
35+
capabilities:
36+
add: ["SYS_ADMIN", "IPC_LOCK", "SYS_RAWIO"]
37+
runAsUser: 0
38+
runAsGroup: 0
39+
env:
40+
- name: STATESVC_ADDR
41+
value: "http://statesvc:1237"
42+
- name: RUN_SERVICE
43+
value: "NodeAgent"
44+
- name: POD_IP
45+
valueFrom:
46+
fieldRef:
47+
fieldPath: status.podIP
48+
- name: NODE_NAME
49+
valueFrom:
50+
fieldRef:
51+
fieldPath: spec.nodeName
52+
- name: ALLOC_MEMORY
53+
valueFrom:
54+
resourceFieldRef:
55+
containerName: nodeagent
56+
resource: requests.memory
57+
- name: ALLOC_MEMORY_2M
58+
valueFrom:
59+
resourceFieldRef:
60+
containerName: nodeagent
61+
resource: requests.hugepages-2Mi
62+
- name: ALLOC_CPU
63+
valueFrom:
64+
resourceFieldRef:
65+
containerName: nodeagent
66+
resource: requests.cpu
67+
resources:
68+
requests:
69+
cpu: "20"
70+
memory: "120Gi" # Regular memory request (RAM)
71+
hugepages-2Mi: "60Gi" # HugePages request
72+
limits:
73+
cpu: "20"
74+
memory: "120Gi" # Regular memory limit (RAM)
75+
hugepages-2Mi: "60Gi"
76+
volumeMounts:
77+
- mountPath: /dev/vfio
78+
name: dev-vfio
79+
- mountPath: /var/run/docker.sock
80+
name: docker-sock
81+
- mountPath: /run/udev
82+
name: run-udev
83+
- mountPath: /dev/hugepages
84+
name: dev-hugepages
85+
mountPropagation: Bidirectional
86+
- mountPath: /opt/inferx/
87+
name: opt-inferx
88+
- mountPath: /etc/letsencrypt/
89+
name: letsencrypt
90+
- mountPath: /var/run/docker/runtime-runc/moby/
91+
name: docker-runtime
92+
- mountPath: /var/lib/docker/
93+
name: docker-lib
94+
- mountPath: /sys/bus/pci/devices
95+
name: pci-devices
96+
readOnly: false
97+
- mountPath: /sys/class/uio
98+
name: uio
99+
readOnly: false
100+
- mountPath: /sys/kernel/mm/hugepages
101+
name: hugepages-sys
102+
readOnly: false
103+
command: ["./onenode", "/opt/inferx/config/node.json"]
104+
volumes:
105+
- name: dev-vfio
106+
hostPath:
107+
path: /dev/vfio
108+
- name: pci-devices
109+
hostPath:
110+
path: /sys/bus/pci/devices
111+
- name: uio
112+
hostPath:
113+
path: /sys/class/uio
114+
- name: hugepages-sys
115+
hostPath:
116+
path: /sys/kernel/mm/hugepages
117+
- name: docker-sock
118+
hostPath:
119+
path: /var/run/docker.sock
120+
- name: run-udev
121+
hostPath:
122+
path: /run/udev
123+
- name: dev-hugepages
124+
hostPath:
125+
path: /dev/hugepages
126+
- name: opt-inferx
127+
hostPath:
128+
path: /opt/inferx/
129+
- name: letsencrypt
130+
hostPath:
131+
path: /etc/letsencrypt/
132+
- name: docker-runtime
133+
hostPath:
134+
path: /var/run/docker/runtime-runc/moby/
135+
- name: docker-lib
136+
hostPath:
137+
path: /var/lib/docker/
138+
restartPolicy: Always
139+
---
140+
apiVersion: apps/v1
141+
kind: DaemonSet
142+
metadata:
143+
name: nodeagent-file
5144
labels:
6145
app: nodeagent
7146
spec:
@@ -13,6 +152,8 @@ spec:
13152
labels:
14153
app: nodeagent
15154
spec:
155+
nodeSelector:
156+
inferx_nodeType: inferx_file
16157
hostPID: true
17158
initContainers:
18159
- name: wait-for-dependencies
@@ -43,6 +184,25 @@ spec:
43184
valueFrom:
44185
fieldRef:
45186
fieldPath: status.podIP
187+
- name: NODE_NAME
188+
valueFrom:
189+
fieldRef:
190+
fieldPath: spec.nodeName
191+
- name: MEMORY_REQUEST
192+
valueFrom:
193+
resourceFieldRef:
194+
containerName: nodeagent
195+
resource: requests.memory
196+
- name: MEMORY_2M_REQUEST
197+
valueFrom:
198+
resourceFieldRef:
199+
containerName: nodeagent
200+
resource: requests.hugepages-2Mi
201+
resources:
202+
requests:
203+
memory: "30Gi" # Regular memory request (RAM)
204+
limits:
205+
memory: "30Gi" # Regular memory limit (RAM)
46206
volumeMounts:
47207
- mountPath: /dev/vfio
48208
name: dev-vfio

nodeconfig/node.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
"resources": {
2323
"CPU": 30000,
2424
"Mem": 400000,
25-
"GPUType": "A4000",
2625
"GPUs": "Auto",
2726
"ContextOverhead": 450,
2827
"MaxContextPerGPU": 1

script/inferx_clean.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
# For every directory directly under PARENT_DIR, invoke INFERX_BIN with the
# `delete` subcommand, passing the directory's base name as the target.
# NOTE(review): presumably this removes leftover sandbox containers whose IDs
# match the sandbox directory names — confirm against the inferx CLI.
2+
3+
PARENT_DIR="/opt/inferx/sandbox/"
4+
INFERX_BIN="/opt/inferx/bin/inferx"
5+
6+
# pkill -9 inferx
7+
8+
for SUBDIR in "$PARENT_DIR"/*; do
9+
if [ -d "$SUBDIR" ]; then
10+
SUBFOLDER_NAME=$(basename "$SUBDIR")
11+
echo "Running inferx on: $SUBFOLDER_NAME"
12+
"$INFERX_BIN" \
13+
--root "/var/run/docker/runtime-runc/moby" \
14+
--log-format json \
15+
--systemd-cgroup delete "$SUBFOLDER_NAME"
16+
17+
fi
18+
done

0 commit comments

Comments
 (0)