import numpy as np
def split_dataset(data, labels, train_ratio=0.6, valid_ratio=0.2, test_ratio=0.2):
    """
    Shuffle a dataset and split it into training, validation, and test sets.

    Samples and labels are permuted together (using NumPy's global random
    state via ``np.random.permutation``) and then cut into three contiguous
    pieces whose sizes follow the given ratios. Split sizes are rounded down;
    any remainder ends up in the test set.

    Parameters:
    - data: array-like of samples, indexed along the first axis.
    - labels: array-like of labels; must have the same length as ``data``.
    - train_ratio: float, the proportion of the dataset to include in the train split.
    - valid_ratio: float, the proportion of the dataset to include in the validation split.
    - test_ratio: float, the proportion of the dataset to include in the test split.

    Returns:
    - X_train, y_train: Training data and labels.
    - X_valid, y_valid: Validation data and labels.
    - X_test, y_test: Test data and labels.

    Raises:
    - AssertionError: if the three ratios do not sum to 1 (within tolerance).
    - ValueError: if ``data`` and ``labels`` differ in length.
    """
    tolerance = 1e-9

    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    # AssertionError is raised explicitly (rather than via ``assert``) so the
    # check survives ``python -O`` while keeping the exception type callers
    # may already rely on.
    ratio_sum = train_ratio + valid_ratio + test_ratio
    if not np.isclose(ratio_sum, 1.0, rtol=0.0, atol=tolerance):
        raise AssertionError(
            f"train_ratio + valid_ratio + test_ratio must equal 1, got {ratio_sum}"
        )

    # asanyarray lets plain sequences work with index-array selection while
    # passing ndarray subclasses through unchanged.
    data = np.asanyarray(data)
    labels = np.asanyarray(labels)

    n_samples = len(data)
    if len(labels) != n_samples:
        raise ValueError(
            f"data and labels must have the same length, "
            f"got {n_samples} and {len(labels)}"
        )

    # Shuffle the data and labels in unison
    shuffled_indices = np.random.permutation(n_samples)
    data_shuffled = data[shuffled_indices]
    labels_shuffled = labels[shuffled_indices]

    # Calculate split indices. The small offset stops float noise (e.g.
    # 10 * (0.7 + 0.2) == 8.999999999999998) from truncating one sample short.
    train_end = int(n_samples * train_ratio + tolerance)
    valid_end = int(n_samples * (train_ratio + valid_ratio) + tolerance)

    # Split the data and labels
    X_train = data_shuffled[:train_end]
    y_train = labels_shuffled[:train_end]
    X_valid = data_shuffled[train_end:valid_end]
    y_valid = labels_shuffled[train_end:valid_end]
    X_test = data_shuffled[valid_end:]
    y_test = labels_shuffled[valid_end:]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
# Example usage:
# Assuming `data` and `labels` are NumPy arrays holding the samples and their corresponding labels
# data = np.random.rand(1000, 10) # Example data (1000 samples, 10 features)
# labels = np.random.randint(2, size=(1000, 1)) # Example binary labels
# X_train, y_train, X_valid, y_valid, X_test, y_test = split_dataset(data, labels)