# Source code for medacy.model.stratified_k_fold

"""
Partitions a data set of sequence labels and classifications into k stratified folds (10 by default).
See Dietterich, 1997 "Approximate Statistical Tests for Comparing Supervised Classification
Learning Algorithms" for in-depth analysis.

Each partition should have an evenly distributed representation of sequence labels.
Without stratification, under-represented labels may not appear in some folds.
"""
import numpy as np
from itertools import cycle

class SequenceStratifiedKFold:
    """
    Splits a collection of label sequences into stratified train/test folds.

    Sequences are dealt round-robin into ``folds`` partitions, one label at a
    time, so that the sequences containing a given label are spread across the
    partitions instead of clustering in a few of them.
    """

    def __init__(self, folds=10):
        """
        :param folds: number of partitions (and therefore train/test splits) to create
        :raises ValueError: if folds is less than 1
        """
        # An empty partition list would only surface later as a bare
        # StopIteration from the round-robin cycler, so reject it up front.
        if folds < 1:
            raise ValueError("folds must be at least 1, got {!r}".format(folds))
        self.folds = folds

    def __call__(self, X, y):
        """
        Returns a list [(train, test), ...] with one element per fold, where each
        element contains the indices of the train and test set for that testing fold.

        A sequence that contains no labels at all is never assigned to a partition
        and therefore appears in neither the train nor the test indices.

        :param X: a collection of sequences (not read; the split is derived from y alone)
        :param y: a collection of sequence labels, one iterable of labels per sequence
        :return: a list of (train_indices, test_indices) tuples of index lists into y
        """
        # NOTE: labels are visited in the sorted order produced by np.unique,
        # not in order of how frequently they occur in the data.
        labels = np.unique([label for sequence in y for label in sequence])

        unassigned = np.ones(len(y), dtype=bool)
        partitions = [[] for _ in range(self.folds)]
        # One cycler is shared by all labels so the round-robin position carries
        # over from one label to the next and the partitions stay balanced in size.
        partition_cycler = cycle(partitions)

        for label in labels:
            for index, sequence in enumerate(y):
                # Each sequence is placed once, under the first label that claims it.
                if unassigned[index] and label in sequence:
                    next(partition_cycler).append(index)
                    unassigned[index] = False

        train_test_array = []
        for fold, test_indices in enumerate(partitions):
            # The training set is every other partition, concatenated in partition order.
            train_indices = [
                index
                for other, partition in enumerate(partitions)
                if other != fold
                for index in partition
            ]
            train_test_array.append((train_indices, test_indices))

        return train_test_array