infer.py
import argparse

import torch
import torchaudio

from preprocess import process_audio_data, process_image_data
from train import WatermelonModel


def infer(audio_path, image_path, model, device):
    """Predict the sweetness of a watermelon from its tapping audio and photo."""
    # Load the raw audio and preprocess both modalities
    audio, sr = torchaudio.load(audio_path)
    mfcc = process_audio_data(audio, sr)
    img = process_image_data(image_path)

    # Bail out if either preprocessing step failed (check before moving to device)
    if mfcc is None or img is None:
        return None

    mfcc = mfcc.to(device)
    img = img.to(device)

    # Run inference without tracking gradients
    with torch.no_grad():
        predicted_sweetness = model(mfcc, img).item()

    return predicted_sweetness

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run Watermelon Sweetness Prediction")
    parser.add_argument(
        "--model_path", type=str, required=True, help="Path to the saved model file"
    )
    parser.add_argument(
        "--audio_path", type=str, required=True, help="Path to audio file"
    )
    parser.add_argument(
        "--image_path", type=str, required=True, help="Path to image file"
    )
    args = parser.parse_args()

    # Select the best available device
    print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
    device = torch.device(
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    print(f"\033[92mINFO\033[0m: Using device: {device}")

    # Load the trained weights and switch the model to evaluation mode
    model = WatermelonModel().to(device)
    model.load_state_dict(torch.load(args.model_path, map_location=device))
    model.eval()

    # Run inference on the audio and image files supplied on the command line
    sweetness = infer(args.audio_path, args.image_path, model, device)

    if sweetness is None:
        print("\033[91mERROR\033[0m: Preprocessing failed, no prediction was made")
    else:
        print(f"Predicted sweetness: {sweetness}")