# -------------- Modification 1 --------------
# Modify sam2_video_predictor.py using the following changes:
# 1. Insert "import numpy as np" after the "import torch.nn.functional as F"
#    import on line 11, so that the import section looks like this:

import warnings
from collections import OrderedDict

import torch
import torch.nn.functional as F
import numpy as np

# -------------- Modification 2 --------------
# Insert the following method after the existing init_state method:
#
#     @torch.inference_mode()
#     def init_state(
#         ....
#         ....
#     )

# Function starts here:
@torch.inference_mode()
def init_state_from_array(
    self,
    image_array,
    offload_video_to_cpu=False,
    offload_state_to_cpu=False,
    async_loading_frames=False,  # Included for compatibility, though not used here
):
    """Initialize an inference state from a NumPy array of images.

    Args:
        image_array (np.ndarray): A NumPy array of shape
            (num_frames, height, width, 3) containing RGB images.
        offload_video_to_cpu (bool): Whether to offload video frames to CPU memory.
        offload_state_to_cpu (bool): Whether to offload the inference state to CPU memory.
        async_loading_frames (bool): Ignored in this version, included for compatibility.

    Returns:
        dict: The initialized inference state.
    """
    compute_device = self.device  # device of the model, e.g., cuda:0

    # Validate input
    if not isinstance(image_array, np.ndarray):
        raise ValueError("image_array must be a NumPy array")
    if image_array.ndim != 4 or image_array.shape[-1] != 3:
        raise ValueError("image_array must have shape (num_frames, height, width, 3)")

    num_frames, video_height, video_width = image_array.shape[:3]

    # Convert the NumPy array to a torch tensor and preprocess
    images = torch.from_numpy(image_array).float()  # Shape: (N, H, W, C)
    images = images.permute(0, 3, 1, 2)  # Shape: (N, C, H, W)

    # Move images to compute_device immediately
    images = images.to(compute_device, non_blocking=True)

    # Resize images to the model's image_size
    if video_height != self.image_size or video_width != self.image_size:
        images = F.interpolate(
            images,
            size=(self.image_size, self.image_size),
            mode="bilinear",
            align_corners=False,
        )

    # Normalize images (ensure mean and std are on the same device)
    img_mean = torch.tensor(
        [0.485, 0.456, 0.406], dtype=torch.float32, device=compute_device
    )[:, None, None]
    img_std = torch.tensor(
        [0.229, 0.224, 0.225], dtype=torch.float32, device=compute_device
    )[:, None, None]
    images = images / 255.0  # Scale pixel values from [0, 255] to [0, 1]; input is assumed to be uint8 RGB
    images -= img_mean
    images /= img_std

    # Move to the appropriate device based on offload_video_to_cpu
    if offload_video_to_cpu:
        images = images.to(torch.device("cpu"), non_blocking=True)

    # Initialize the inference state
    inference_state = {}
    inference_state["images"] = images
    inference_state["num_frames"] = num_frames
    inference_state["offload_video_to_cpu"] = offload_video_to_cpu
    inference_state["offload_state_to_cpu"] = offload_state_to_cpu
    inference_state["video_height"] = video_height
    inference_state["video_width"] = video_width
    inference_state["device"] = compute_device
    if offload_state_to_cpu:
        inference_state["storage_device"] = torch.device("cpu")
    else:
        inference_state["storage_device"] = compute_device

    # Initialize the other state variables, consistent with "sam2_video_predictor-fb.py"
    inference_state["point_inputs_per_obj"] = {}
    inference_state["mask_inputs_per_obj"] = {}
    inference_state["cached_features"] = {}
    inference_state["constants"] = {}
    inference_state["obj_id_to_idx"] = OrderedDict()
    inference_state["obj_idx_to_id"] = OrderedDict()
    inference_state["obj_ids"] = []
    inference_state["output_dict_per_obj"] = {}
    inference_state["temp_output_dict_per_obj"] = {}
    inference_state["frames_tracked_per_obj"] = {}

    # Warm up the visual backbone and cache the image feature on frame 0
    self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
    return inference_state