Skip to content

Commit 922f7b5

Browse files
committed
[bugfix] Add deployment delete request to edges when deployment fails
1 parent b68a835 commit 922f7b5

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

python/fedml/computing/scheduler/model_scheduler/master_job_runner.py

+5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from multiprocessing import Queue
1010

1111
import fedml
12+
from .device_model_msg_object import FedMLModelMsgObject
1213
from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs
1314
from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter
1415
from .device_client_constants import ClientConstants
@@ -274,10 +275,14 @@ def process_deployment_result_message(self, topic=None, payload=None):
274275

275276
# Avoid an endless loop: if the rollback also failed, we should report the failure to MLOps
276277
if self.replica_controller.under_rollback or self.is_fresh_endpoint:
278+
logging.info(f"process deploy result, under_rollback {self.replica_controller.under_rollback}, is_fresh_endpoint {self.is_fresh_endpoint}")
277279
self.send_deployment_status(
278280
end_point_id, end_point_name, payload_json["model_name"], "",
279281
ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED,
280282
message_center=self.message_center)
283+
# When reporting the failure to MLOps, we also need to delete any replica that has been successfully deployed and release the GPU
284+
model_msg_object = FedMLModelMsgObject(topic, payload)
285+
self.send_deployment_delete_request_to_edges(payload, model_msg_object, message_center=self.message_center)
281286
return
282287

283288
# Failure handler: send the rollback message to the worker devices only if it has not already been rolled back

python/fedml/core/mlops/mlops_device_perfs.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,7 @@ def setup_realtime_stats_process(self, sys_args):
128128
self.monitor_run_master_process.start()
129129

130130
def report_device_realtime_stats_entry(self, sys_event, role, is_client=False):
131-
logging.info(f"Report device realtime stats, role {role}, is_client {is_client}, process id {os.getpid()}")
132-
131+
# logging.info(f"Report device realtime stats, role {role}, process id {os.getpid()}")
133132
self.device_realtime_stats_event = sys_event
134133
mqtt_mgr = MqttManager(
135134
self.args.mqtt_config_path["BROKER_HOST"],

0 commit comments

Comments
 (0)