function Ju2RoSpnPdTr(modelFile,XTr,YTr,XDev,YDev,SpnBinDir,NG,P,Version)
%JU2ROSPNPDTR Train an SPN with the Poon and Domingos (PD) algorithm.
%   Ju2RoSpnPdTr(modelFile,XTr,YTr,XDev,YDev,SpnBinDir,NG,P,Version)
%   forwards the training and development data to trainSPN_PD and then
%   calls PrintSpnLosF on the resulting model file. Valid for
%   multi-output SPNs when Version is 2.
%
%   Inputs
%     modelFile  passed unchanged to trainSPN_PD ('modelFile') and to
%                PrintSpnLosF.
%     XTr, XDev  training / development features. They are transposed
%                before being handed to the trainer, so each COLUMN is
%                counted as one sample.
%     YTr, YDev  training / development labels. Forwarded to the trainer
%                as 'outputLabels' {YTr, YDev} only when Version is 2.
%     SpnBinDir  directory of the SPN binaries; a trailing '/' is
%                appended and the result is passed as 'binPath'. For
%                Version 1 every '/SPNv2/' in that path is replaced by
%                '/SPNv1/'.
%     NG         passed as 'numGauss'.
%     P          struct; the fields P.ExOpt.BatchS, P.ExOpt.L1,
%                P.ExOpt.L2 and P.ExOpt.numIterPD are read.
%     Version    1 or 2. Version 1 raises an error when YTr holds more
%                than one distinct label value. Any other value raises
%                'Ju2RoSpnPdTr:unsupportedVersion'.
%
%   PARAM INFORMATION: see the notes at the end of this file.
%
%   Recorded training times
%   For AvTr: 3970, AvDev: 1985: training time 2316 seconds
%   For AvTr: 271634, AvDev: 10377: first iteration not reached in 15 hours
%   For AvTr: 7940, AvDev: 5955, (4 and 2): 23040 seconds
%   Tr and Dev: 67712 and 50790 samples (Global phone multioutput): more than 14 hours (end not reached)
%   Tr and Dev: 18120 and 13595 samples (Global phone multioutput): more than 14 hours (not finished, reached iteration 17)
%   Tr and Dev: 9925 and 3928 samples (23x2, Global phone multioutput): 3654.460010 seconds
%   Tr and Dev: 9925 and 7543 samples (40x1, Global post multioutput): 26873.964811 seconds
%   On 132MultO Tr: 660samp 213sec, 1188samp 701sec, 2244samp 2208sec, 13332samp 41234sec (11 h)
%   ------------------------------------------------------------------------
%   Tr and Dev (40x1Fmllr,MultO132): 10132 and 3132 samples, 26890.694971 seconds (7 h)
%   Tr and Dev (40x1Fmllr,MultO1985): 11985 and 4985 samples, 28388.114720 seconds (7.8 h)
%   Tr and Dev (40x1Fmllr,MultO1985): 21985 and 7985 samples, estimated approx. 21 h
%   Tr and Dev (40x1Fmllr,MultO1985,batchS=256): 11985 and 4984 samples, 85876 seconds (24 h)
%   Tr and Dev (40x1Fmllr,MultO1985,batchS=10): 11985 and 4984 samples, 37036 seconds (10 h)

% Reject unsupported versions up front. Without this check an unknown
% Version skipped training entirely and PrintSpnLosF was still run on a
% model that this call never trained.
if ~(isequal(Version, 1) || isequal(Version, 2))
    if isnumeric(Version) && isscalar(Version)
        versionText = num2str(Version);
    else
        versionText = ['<' class(Version) '>'];
    end
    error('Ju2RoSpnPdTr:unsupportedVersion', ...
        'Unsupported SPN Version %s (expected 1 or 2).', versionText);
end

Bin_Path = [SpnBinDir '/'];

% Structure / EM settings forwarded to trainSPN_PD.
% NOTE: numSums, sparsePrior and sparsePriorAnneal used to be assigned
% here but were never forwarded to trainSPN_PD, so they had no effect and
% have been removed; whatever trainSPN_PD uses when they are omitted
% still applies.
numGauss = NG;
batchSize = P.ExOpt.BatchS; % e.g. 10
coarseRes = [4];
coarseResX = [];
coarseResY = [];

% Positional arguments 2 and 3 of trainSPN_PD.
L1 = P.ExOpt.L1; % e.g. 40
L2 = P.ExOpt.L2; % e.g. 1
numIterPD = P.ExOpt.numIterPD; % e.g. 20

earlyStoppingKrel = 3;
earlyStoppingKabs = 5;

% Prepare data for PD learning: the trainer receives one sample per row.
TrainDataPD = XTr';
DevDataPD = XDev';
TrNf = size(TrainDataPD, 1);
DevNf = size(DevDataPD, 1);

% Use a timer handle so that a nested tic (here or in the trainer) cannot
% disturb this measurement, and so that the caller's global tic is left
% untouched.
trainTimer = tic;

fprintf('Tr and Dev: %d and %d samples\n',TrNf,DevNf);

% Version-specific setup. Version 2 additionally accepts 'outputLabels'.
% Historical note: swapping the MATLAB v1 and v2 wrappers (or this
% function and its old version) produced the same result, so any v1/v2
% difference was suspected to be in the C++ code.
labelOpts = {};
switch Version
    case 1
        if length(unique(YTr(:))) > 1
            error('SPN version 1 not designed for MultO')
        else
            warning('Changing to SPN version 1')
        end
        Bin_Path = strrep(Bin_Path, '/SPNv2/', '/SPNv1/');

    case 2
        labelOpts = {'outputLabels', {YTr, YDev}};
end

% Single trainer call for both versions; the name/value order matches the
% original per-version calls ('outputLabels' right after 'valData').
trainSPN_PD(TrainDataPD, L1, L2, ...
    'valData', DevDataPD, ...
    labelOpts{:}, ...
    'binPath', Bin_Path, ...
    'numGauss', numGauss, ...
    'numIter', numIterPD, ...
    'modelFile', modelFile, ...
    'stop_absLikelihoodChange', -1e20, ...
    'earlyStoppingKrel', earlyStoppingKrel, ...
    'earlyStoppingKabs', earlyStoppingKabs, ...
    'coarseRes', coarseRes, ...
    'coarseResX', coarseResX, ...
    'coarseResY', coarseResY, ...
    'batchSize', batchSize);

toc(trainTimer)

PrintSpnLosF(XTr,YTr,XDev,YDev,modelFile,'PD');


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%PARAM INFORMATION
%
%dimX = 40; If some 1D then both the same: dimX Y=[1 40] = [40 1]
%
%numGauss = 7;
%
%numSums = 20; The number of sum nodes per rectangle. The PD algorithm recursively divides the image into sub-rectangles. 
%Each sub-rectangle is represented by a dictionary of sum nodes (i.e. mixture distributions). 
%Like numGauss, this should ideally be set by cross-validation.
%
%batchSize = 10; %batch size of incremental EM. Using incremental EM, it is possible to immediately update the weights after 
%seeing each batch (can also be a _single_ sample). This will have some effect on the training, but 
%again hard to say which batchSize works best.
%The batch size in the P&D algo is the batch size of incremental EM, see:
%http://www.cs.toronto.edu/~fritz/absps/emk.pdf
%In a nutshell, the idea of incremental EM is as follows:
%Given a parametric probabilistic model with latent variables (e.g. GMM), standard EM iterates over all training samples and uses 
%the expected sufficient statistics to update the parameters.
%The idea is to use the statistics of the sample immediately, and not to wait until the full pass over training data is completed. 
%The batch size of incremental EM is the number of samples which are processed  for each update. It can generally be expected, that smaller 
%batch sizes in incremental EM converge more quickly. You could even use batch-size 1. Maybe this would be worth a try? 
%
%sparsePrior = 1; the actual structure defined by Poon and Domingos is HUGE. So, they actually virtualize the 
%network and generate a sparse sub-structure on the fly. sparsePrior is the price (measured in 'nats', natural unit of information ) 
%you have to pay to generate a new node. Thus, the higher the sparser the SPN.
%
%sparsePriorAnneal = 10; PD don't start with the full sparse prior, but let it grow linearly over
%sparsePriorAnneal many iterations. Thus, it is cheaper to generate new nodes in the beginning of the training.
%
% coarseRes = [4]; %v1 4
%coarseResX = []; %v1 1
%coarseResY = []; %v1 4
%The Poon and Domingos code splits the image (in your case the time-frequency representation) recursively into sub-rectangles. 
%In principle this happens by splitting a rectangle in all possible ways into two smaller rectangles, using a stepsize of 1. 
%Since the generated structure would then be huge, they take a larger stepsize, i.e. they consider fewer splits. This is controlled 
%by the coarseRes parameters: when coarseRes = [4], then rectangles which are strictly larger than the 4x4 square are split using
%a stepsize of 4. coarseRes is a vector, e.g. coarseRes = [4,8,16] would mean that rectangles which are larger than the 16x16 
%square are split using stepsize 16, rectangles which are larger than the 8x8 square are split using stepsize 8, etc.
% When both coarseResX and coarseResY are non-empty, then the stepsizes are determined independently for the two dimensions -- so far, 
%this variant has not worked well, so it is better not to use them.
% In lines 294-329 in PDLearning.cpp you can see how the stepsizes are determined.
% Long story short: If you use e.g. coarseRes = [4,8,16], then the structures become smaller. So far, however, coarseRes = [4] usually worked best.
%
%numIter = 20; number of EM iterations. I think you can use far fewer iterations -- 
%for bandwidth extension 3 (!) iterations were enough. Incremental EM converges pretty fast.
%
%earlyStoppingKrel = 3; number of times the dev-likelihood is allowed to decrease before finishing the training 
%(the best network on the development set is recorded).
%earlyStoppingKabs = 5; number of times the dev-likelihood is allowed to be worse than the best dev-likelihood 
%seen so far.





