This code box contains updated code from my previous post. The main change is the inclusion of bias units for the directed auto-regressive weights and the visible to hidden weights. In addition there is code showing how data is pre-processed into batches/targets for the pre-training and code showing how the weight matrices are manipulated into a form suitable for my existing optimised crbm code for gaussian units.
%  select rolling window length to use - an optimisable parameter via pso?
rolling_window_length = 50 ;
%  how-many timesteps do we look back for directed connections - this is what we call the "order" of the model 
n1 = 3 ; % first "gaussian" layer order
n2 = 3 ; % second "binary" layer order
batchsize = 5 ;
%  taking into account rolling_window_length, n1, n2 and batchsize, get total lookback length
remainder = rem( ( rolling_window_length + n1 + n2 ) , batchsize ) ;
if ( remainder > 0 ) % number of training examples with lookback and orders n1 and n2 not exactly divisable by batchsize
lookback_length = ( rolling_window_length + n1 + n2 + ( batchsize - remainder ) ) ; % increase the lookback_length
else                 % number of training examples with lookback and orders n1 and n2 exactly divisable by batchsize
lookback_length = ( rolling_window_length + n1 + n2 ) ;
end
%  create batchdataindex using lookback_length to index bars in the features matrix
batchdataindex = ( ( training_point_index - ( lookback_length - 1 ) ) : 1 : training_point_index )' ;
batchdata = features( batchdataindex , : ) ;
%  z-normalise the batchdata matrix with the mean and std of columns 
data_mean = mean( batchdata , 1 ) ;
data_std = std( batchdata , 1 ) ;
batchdata = ( batchdata .- repmat( data_mean , size( batchdata , 1 ) , 1 ) ) ./ repmat( data_std , size( batchdata , 1 ) , 1 ) ; % batchdata is now z-normalised by data_mean & data_std
% add bias neurons
batchdata = [ ones( size( batchdata , 1 ) , 1 ) batchdata ] ;
%  create the minibatch index matrix for gaussian rbm pre-training of directed weights w
minibatch = ( 1 : 1 : size( batchdata , 1 ) ) ; minibatch( 1 : ( size( batchdata , 1 ) - rolling_window_length ) ) = [] ;
minibatch = minibatch( randperm( size( minibatch , 2 ) ) ) ; minibatch = reshape( minibatch , batchsize , size( minibatch , 2 ) / batchsize ) ; 
% PRE-TRAINING FOR THE VISABLE TO HIDDEN AND THE VISIBLE TO VISIBLE WEIGHTS %%%%
% First create a training set and target set for the pre-training
dAuto_Encode_targets = batchdata( : , 2 : end ) ; dAuto_Encode_training_data = [] ;
  
  % loop to create the dAuto_Encode_training_data ( n1 == "order" of the gaussian layer of crbm )
  for ii = 1 : n1
  dAuto_Encode_training_data = [ dAuto_Encode_training_data shift( batchdata , ii ) ] ;
  end
% now delete the first n1 rows due to circular shift induced mismatch of data and targets
dAuto_Encode_targets( 1 : n1 , : ) = [] ; dAuto_Encode_training_data( 1 : n1 , : ) = [] ;
% add bias
%dAuto_Encode_training_data = [ ones( size( dAuto_Encode_training_data , 1 ) , 1 ) dAuto_Encode_training_data ] ; 
% bias units idx
bias_idx = ( 1 : size( batchdata , 2 ) : size( dAuto_Encode_training_data , 2 ) ) ;
% DO RBM PRE-TRAINING FOR THE BOTTOM UP DIRECTED WEIGHTS W %%%%%%%%%%%%%%%%%%%%%
% use rbm trained initial weights instead of using random initialisation for weights
% Doing this because we are not using regularisation in the autoencoder pre-training
epochs = 10000 ;
hidden_layer_size = 2 * size( dAuto_Encode_targets , 2 ) ;
w_weights = gaussian_rbm( dAuto_Encode_targets , minibatch , epochs , hidden_layer_size ) ;
% keep a copy of these original w_weights
w1 = w_weights ;
A_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , size( dAuto_Encode_targets , 2 ) ) ;
B_weights = gaussian_rbm( dAuto_Encode_training_data , minibatch , epochs , hidden_layer_size ) ;
% create weight update matrices
A_weights_update = zeros( size( A_weights ) ) ;
B_weights_update = zeros( size( B_weights ) ) ;
w_weights_update = zeros( size( w_weights ) ) ;
% for adagrad
historical_A = zeros( size( A_weights ) ) ;
historical_B = zeros( size( B_weights ) ) ;
historical_w = zeros( size( w_weights ) ) ;
% set some training parameters
n = size( dAuto_Encode_training_data , 1 ) ; % number of training examples in dAuto_Encode_training_data
input_layer_size = size( dAuto_Encode_training_data , 2 ) ;
fudge_factor = 1e-6 ; % for numerical stability for adagrad
learning_rate = 0.1 ; % will be changed to 0.01 after 50 iters through epoch loop
mom = 0 ;             % will be changed to 0.9 after 50 iters through epoch loop
noise = 0.5 ;
epochs = 1000 ;
cost = zeros( epochs , 1 ) ;
lowest_cost = inf ;
  % Stochastic Gradient Descent training over dAuto_Encode_training_data 
  for iter = 1 : epochs
   
      % change momentum and learning_rate after 50 iters
      if iter == 50
      mom = 0.9 ;
      learning_rate = 0.01 ;
      end
  
      index = randperm( n ) ; % randomise the order of training examples
     
      for training_example = 1 : n
      
      % Select data for this training batch
      tmp_X = dAuto_Encode_training_data( index( training_example ) , : ) ;
      tmp_T = dAuto_Encode_targets( index( training_example ) , : ) ;
      
      % Randomly black out some of the input training data
      tmp_X( rand( size( tmp_X ) ) < noise ) = 0 ;
      % but keep bias units
      tmp_X( bias_idx ) = 1 ;
      
      % feedforward tmp_X through B_weights and get sigmoid e.g ret = 1.0 ./ ( 1.0 + exp(-input) )
      tmp_X_through_sigmoid = 1.0 ./ ( 1.0 .+ exp( - B_weights * tmp_X' ) ) ;
      
      % Randomly black out some of tmp_X_through_sigmoid for dropout training
      tmp_X_through_sigmoid( rand( size( tmp_X_through_sigmoid ) ) < noise ) = 0 ;
    
      % feedforward tmp_X through A_weights and add to tmp_X_through_sigmoid * w_weights for linear output layer
      final_output_layer = ( tmp_X * A_weights' ) .+ ( tmp_X_through_sigmoid' * w_weights ) ;
    
      % now do backpropagation
      % this is the derivative of weights for the linear final_output_layer
      delta_out = ( tmp_T - final_output_layer ) ;
      
      % NOTE! gradient of sigmoid function g = sigmoid(z) .* ( 1.0 .- sigmoid(z) )
      sig_grad = tmp_X_through_sigmoid .* ( 1 .- tmp_X_through_sigmoid ) ; 
      
      % backpropagation only through the w_weights that are connected to tmp_X_through_sigmoid
      delta_hidden = ( w_weights * delta_out' ) .* sig_grad ;
      
      % apply deltas from backpropagation with adagrad for the weight updates
      historical_A = historical_A .+ ( delta_out' * tmp_X ).^2 ;    
      A_weights_update = mom .* A_weights_update .+ ( learning_rate .* ( delta_out' * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_A ) ) ;
      
      historical_w = historical_w .+ ( tmp_X_through_sigmoid * delta_out ).^2 ;
      w_weights_update = mom .* w_weights_update .+ ( learning_rate .* ( tmp_X_through_sigmoid * delta_out ) ) ./ ( fudge_factor .+ sqrt( historical_w ) ) ;
      
      historical_B = historical_B .+ ( delta_hidden * tmp_X ).^2 ;
      B_weights_update = mom .* B_weights_update .+ ( learning_rate .* ( delta_hidden * tmp_X ) ) ./ ( fudge_factor .+ sqrt( historical_B ) ) ;
      
      % update the weight matrices with weight_updates
      A_weights = A_weights + A_weights_update ;
      B_weights = B_weights + B_weights_update ;
      w_weights = w_weights + w_weights_update ;
      
      end % end of training_example loop
  
  % feedforward with this epoch's updated weights
  epoch_trained_tmp_X_through_sigmoid = ( 1.0 ./ ( 1.0 .+ exp( -( ( B_weights ) * dAuto_Encode_training_data' ) ) ) ) ;
  epoch_trained_output = ( dAuto_Encode_training_data * ( A_weights )' ) .+ ( epoch_trained_tmp_X_through_sigmoid' * ( w_weights ) ) ;
 
  % get sum squared error cost
  cost( iter , 1 ) = sum( sum( ( dAuto_Encode_targets .- epoch_trained_output ) .^ 2 ) ) ;
  
    % record best so far
    if cost( iter , 1 ) <= lowest_cost
       lowest_cost = cost( iter , 1 ) ;
       iter_min = iter ;
       best_A = A_weights ;
       best_B = B_weights ;
       best_w = w_weights ;
    end
  
  end % end of backpropagation loop
lowest_cost                                        % print final cost to terminal
iter_min ;                                           % and the iter it occured on
graphics_toolkit( "qt" ) ;
figure(1) ; plot( cost , 'r' , 'linewidth' , 3 ) ; % and plot the cost curve
% plot weights
graphics_toolkit( "gnuplot" ) ;
figure(2) ; surf( best_A ) ; title( 'Best A Weights' ) ;
figure(3) ; surf( best_B ) ; title( 'Best B Weights' ) ;
figure(4) ; surf( best_w ) ; title( 'Best w Weights' ) ;
% END OF CRBM WEIGHT PRE-TRAINING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% extract bias weights from best_A and best_B
A_bias = best_A( : , bias_idx ) ; best_A( : , bias_idx ) = [] ; A_bias = sum( A_bias , 2 ) ; 
B_bias = best_B( : , bias_idx ) ; best_B( : , bias_idx ) = [] ; B_bias = sum( B_bias , 2 ) ;
% now delete bias units from batchdata
batchdata( : , 1 ) = [] ;
% create reshaped structures to hold A_weights and B_weights
A1 = reshape( best_A , size( best_A , 1 ) , size( best_A , 2 ) / n1 , n1 ) ;
B1 = reshape( best_B , size( best_B , 1 ) , size( best_B , 2 ) / n1 , n1 ) ;The following video shows the evolution of the weights whilst training over 20 consecutive price bars. The top three panes are the weights after the denoising autoencoder training and the bottom three are the same weights after being used as initialisation weights for the CRBM training and then being modified by this CRBM training. It is this final set of weights that would typically be used for CRBM generation.
