Split a Dataset into Train, Validation, and Test Sets

Created	@February 20, 2024
Tags	ML Coding

import numpy as np

def split_dataset(data, labels, train_ratio=0.6, valid_ratio=0.2, test_ratio=0.2, *, seed=None):
    """
    Shuffle a dataset and split it into training, validation, and test sets.

    The samples are permuted once, and the same permutation is applied to
    both ``data`` and ``labels`` so every sample stays paired with its label.
    The first ``train_ratio`` share of the shuffled samples becomes the
    training set, the next ``valid_ratio`` share the validation set, and
    everything that remains the test set.

    Parameters:
    - data: numpy array of data, indexed along its first axis.
    - labels: numpy array of labels corresponding to the data; must have the
      same length as ``data``.
    - train_ratio: float, the proportion of the dataset to include in the train split.
    - valid_ratio: float, the proportion of the dataset to include in the validation split.
    - test_ratio: float, the proportion of the dataset to include in the test split.
      Only used to validate that the three ratios sum to 1; the test set
      always receives all samples left after the other two splits.
    - seed: optional keyword-only seed passed to ``np.random.default_rng`` for
      a reproducible shuffle. When None (default) the shuffle uses NumPy's
      global random state via ``np.random.permutation``.

    Returns:
    - X_train, y_train: Training data and labels.
    - X_valid, y_valid: Validation data and labels.
    - X_test, y_test: Test data and labels.

    Raises:
    - ValueError: if any ratio is negative, if the ratios do not sum to 1
      (within a small floating-point tolerance), or if ``data`` and
      ``labels`` have different lengths.
    """
    # Absorbs binary floating-point rounding error in ratio arithmetic.
    tolerance = 1e-9

    ratios = (train_ratio, valid_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(
            f"Split ratios must be non-negative, got {ratios}."
        )

    # Exact equality would reject valid inputs such as 0.7 + 0.2 + 0.1, which
    # evaluates to 0.9999999999999999. Written as `not (... <= ...)` so that
    # NaN ratios are rejected as well.
    ratio_total = train_ratio + valid_ratio + test_ratio
    if not abs(ratio_total - 1.0) <= tolerance:
        raise ValueError(
            f"Split ratios must sum to 1, got {ratio_total}."
        )

    n_samples = len(data)
    if len(labels) != n_samples:
        raise ValueError(
            f"data and labels must have the same length, "
            f"got {n_samples} and {len(labels)}."
        )

    # Shuffle the data and labels in unison
    if seed is None:
        shuffled_indices = np.random.permutation(n_samples)
    else:
        shuffled_indices = np.random.default_rng(seed).permutation(n_samples)
    data_shuffled = data[shuffled_indices]
    labels_shuffled = labels[shuffled_indices]

    # Calculate split indices. The tolerance is added before truncating so a
    # product that lands just below a whole number because of rounding error
    # (e.g. 0.29 * 100 == 28.999999999999996) does not lose a sample.
    train_end = int(n_samples * train_ratio + tolerance)
    valid_end = int(n_samples * (train_ratio + valid_ratio) + tolerance)

    # Split the data and labels
    X_train = data_shuffled[:train_end]
    y_train = labels_shuffled[:train_end]
    X_valid = data_shuffled[train_end:valid_end]
    y_valid = labels_shuffled[train_end:valid_end]
    X_test = data_shuffled[valid_end:]
    y_test = labels_shuffled[valid_end:]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

# Example usage:
# Assuming data and labels are your dataset and labels numpy arrays respectively
# data = np.random.rand(1000, 10)  # Example data (1000 samples, 10 features)
# labels = np.random.randint(2, size=(1000, 1))  # Example binary labels

# X_train, y_train, X_valid, y_valid, X_test, y_test = split_dataset(data, labels)