function [x, it, time, crit, conv] = train_classifier_binary(dataset, loss, minibatch_size, num_blocks, prior, lambda, algo, opt)
%TRAIN_CLASSIFIER_BINARY  Fit classifier weights with a selectable solver.
%
%   [x, it, time, crit, conv] = TRAIN_CLASSIFIER_BINARY(dataset, loss, ...
%       minibatch_size, num_blocks, prior, lambda, algo, opt)
%
%   Inputs
%     dataset        - struct with fields 'samples' (one row per sample, one
%                      column per feature) and 'labels'.
%     loss           - loss selector, forwarded to set_loss_binary[_blkdiag].
%     minibatch_size - forwarded to set_loss_binary[_blkdiag].
%     num_blocks     - number of feature blocks. Only honoured when algo is
%                      'dr2' or 'bcdr2'; forced to 1 for every other solver.
%                      Must divide the number of features.
%     prior, lambda  - regularizer selector and its weight, forwarded to
%                      set_prior.
%     algo           - solver name (char), forwarded to exec_algo.
%     opt            - solver options struct, forwarded to exec_algo. If it
%                      has a field 'inf' with a positive value, a preliminary
%                      run of opt.inf iterations with tolerance 0 is executed
%                      and its result is passed to the main run as x_inf.
%
%   Outputs
%     x              - weights, blocks stacked along the first dimension
%                      (one column per class).
%     it, time, crit, conv
%                    - returned unchanged from exec_algo (presumably the
%                      iteration count, timing, criterion values and
%                      convergence trace -- confirm against exec_algo).
%
%   Note: the starting point is drawn with randn, so results depend on the
%   state of the random number generator.

% problem size
N = size(dataset.samples, 2);   % num. of features
K = max(dataset.labels(:));     % num. of classes (K=1 -> binary)

% binary vs multiclass
labels = dataset.labels;
if K > 1
    labels = one_vs_all(labels);
end

% solver-dependent switches passed on to the loss constructors
do_normalize = any(strcmpi(algo, {'pd', 'bcpd', 'async', 'pfwd'}));
skip         = any(strcmpi(algo, {'sfb', 'rda'}));   % stochastic methods

% block-diagonalization is only used by the 'dr2' family
if ~any(strcmpi(algo, {'dr2', 'bcdr2'}))
    num_blocks = 1;
end

% initialization
% Reject block counts that would otherwise pass the divisibility test and
% silently produce an empty or ill-shaped starting point (e.g. negatives).
if ~isscalar(num_blocks) || ~isfinite(num_blocks) || num_blocks < 1 || num_blocks ~= fix(num_blocks)
    error('The number of blocks must be a positive integer')
end
if rem(N, num_blocks) ~= 0
    error('The number of features must be divisible by the number of blocks')
end
B  = N / num_blocks;                 % features per block
xx = randn([B, K, num_blocks]);      % random starting point, one page per block

% proximity operators
f = set_prior(prior, lambda);
if num_blocks > 1
    h = set_loss_binary_blkdiag(loss, dataset.samples, labels, minibatch_size, num_blocks, do_normalize);
else
    h = set_loss_binary(loss, dataset.samples, labels, minibatch_size, do_normalize, skip);
end

% optimization (for x_inf)
% Optional reference run: fixed iteration budget, stopping tolerance disabled.
x_inf = 0;
if isfield(opt, 'inf') && opt.inf > 0
    new_opt      = opt;
    new_opt.tol  = 0;
    new_opt.iter = opt.inf;
    x_inf = exec_algo(algo, xx, f, h, new_opt);
end

% optimization
[xx, it, time, crit, conv] = exec_algo(algo, xx, f, h, opt, x_inf);

% inverse reshaping
% Stack the block pages along the first dimension without growing an array
% in a loop: moving the block index next to the row index makes column-major
% reshape produce the same layout as vertically concatenating xx(:,:,b).
pages = xx(:, :, 1:num_blocks);
x = reshape(permute(pages, [1 3 2]), [], size(pages, 2));