deepspeed

2025-09-15 14:48:36 +08:00 · 2024-08-07 12:48:02 +08:00 · 2024-08-07 12:48:02 +08:00 · 29fa4e4789
commit 29fa4e4789
parent 73c5196e97
1 changed file with 27 additions and 31 deletions
--- a/funasr/train_utils/load_pretrained_model.py
+++ b/funasr/train_utils/load_pretrained_model.py
@@ -35,41 +35,37 @@ def load_pretrained_model(
    logging.info(f"ckpt: {path}, use_deepspeed: {use_deepspeed}")
-    if oss_bucket is None:
+    if use_deepspeed and os.path.isdir(path):
-        if use_deepspeed:
+        ckpt_dir = os.path.dirname(path)
-            ckpt_dir = os.path.dirname(path)
+        ckpt_name = os.path.basename(path)
-            ckpt_name = os.path.basename(path)
+        if os.path.exists(f"{ckpt_dir}/zero_to_fp32.py"):
-            if os.path.exists(f"{ckpt_dir}/zero_to_fp32.py"):
+            print("Detect zero_to_fp32, begin to convert fp32 model")
-                print("Detect zero_to_fp32, begin to convert fp32 model")
+            ckpt_fp32 = f"{ckpt_dir}/{ckpt_name[3:]}"
-                ckpt_fp32 = f"{ckpt_dir}/{ckpt_name[3:]}"
+            if os.path.exists(ckpt_fp32):
-                if os.path.exists(ckpt_fp32):
+                print(f"Detect zero_to_fp32 already exist! Loading it directly. {ckpt_fp32}")
-                    print(f"Detect zero_to_fp32 already exist! Loading it directly. {ckpt_fp32}")
+                src_state = torch.load(ckpt_fp32, map_location=map_location)
                    src_state = torch.load(ckpt_fp32, map_location=map_location)
                else:
                    with open(f"{ckpt_dir}/latest", "w") as latest:
                        latest.write(ckpt_name)
                        latest.flush()
                    from deepspeed.utils.zero_to_fp32 import (
                        get_fp32_state_dict_from_zero_checkpoint,
                    )
                    src_state = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)  # already on cpu
                    if kwargs.get("save_deepspeed_zero_fp32", False):
                        print(
                            f'save_deepspeed_zero_fp32: {kwargs.get("save_deepspeed_zero_fp32", False)}, {ckpt_fp32}'
                        )
                        torch.save({"state_dict": src_state}, ckpt_fp32)
            else:
-                print("Detect deepspeed without zero, load fp32 model directly")
+                with open(f"{ckpt_dir}/latest", "w") as latest:
-                for item in os.listdir(path):
+                    latest.write(ckpt_name)
-                    if item.endswith(".pt"):
+                    latest.flush()
-                        src_state = torch.load(f"{path}/{item}", map_location=map_location)
+                from deepspeed.utils.zero_to_fp32 import (
                    get_fp32_state_dict_from_zero_checkpoint,
                )
                src_state = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)  # already on cpu
                if kwargs.get("save_deepspeed_zero_fp32", False):
                    print(
                        f'save_deepspeed_zero_fp32: {kwargs.get("save_deepspeed_zero_fp32", False)}, {ckpt_fp32}'
                    )
                    torch.save({"state_dict": src_state}, ckpt_fp32)
        else:
-            src_state = torch.load(path, map_location=map_location)
+            print("Detect deepspeed without zero, load fp32 model directly")
            for item in os.listdir(path):
                if item.endswith(".pt"):
                    src_state = torch.load(f"{path}/{item}", map_location=map_location)
    else:
-        buffer = BytesIO(oss_bucket.get_object(path).read())
+        src_state = torch.load(path, map_location=map_location)
        src_state = torch.load(buffer, map_location=map_location)
    src_state = src_state["state_dict"] if "state_dict" in src_state else src_state
    src_state = src_state["model_state_dict"] if "model_state_dict" in src_state else src_state