No action applied to my step function in DDPG

ali kadkhodaei on 16 Jan 2023
I define a dynamic environment with rlFunctionEnv; it uses myResetFunction and myStepFunction.
When I set agentOpts.NoiseOptions.Variance = 0, the action argument received by myStepFunction is always zero. This suggests that only the exploration noise drives training and no actor output reaches the step function.
I put disp(action) inside the step function: it prints only zeros when the variance is zero, but nonzero values when I set agentOpts.NoiseOptions.Variance = 0.6.
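To narrow this down, a check along these lines (a sketch only; obs is a placeholder observation, in the spirit of the commented-out getAction line in my script below) would show what the actor itself outputs before any exploration noise is added:

% Sketch: query the deterministic actor directly; no exploration noise is involved here.
obs = {rand(6,1)};                          % placeholder observation
actorOut = cell2mat(getAction(actor,obs));  % raw actor output
disp(actorOut)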
In addition, I set 'SaveExperienceBufferWithAgent',true and 'ResetExperienceBufferBeforeTraining',false,
but I cannot find a way to access the data stored in the ExperienceBuffer.
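As a stopgap I could log each transition myself at the end of myStepFunction (a sketch; experienceLog is a hypothetical global that would have to be initialised as {} before training, and it is separate from the agent's internal buffer), but I would still like to know the supported way to read the saved ExperienceBuffer:

% Hypothetical manual log, appended at the end of myStepFunction;
% initialise experienceLog = {} in the base workspace before training.
global experienceLog
experienceLog{end+1} = struct( ...
    'Observation', LoggedSignals.State, ...
    'Action',      DeltaTheta, ...
    'Reward',      Reward, ...
    'IsDone',      IsDone);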
My code is:
Ts = 0.05;   % agent sample time (s)
Tf = 40;     % episode duration (s)
obsInfo = rlNumericSpec([6 1]);
obsInfo.Name = 'Robot States';
obsInfo.Description = 'Theta';
obsInfo.UpperLimit = theta_max;
obsInfo.LowerLimit = theta_min;
actInfo = rlNumericSpec([6 1]);
actInfo.Name = 'Robot Action';
actInfo.Description = 'Delta Theta';
actInfo.UpperLimit = Ts*thetaDot_max;
actInfo.LowerLimit = Ts*thetaDot_min;
numObs = obsInfo.Dimension(1);
numAcs = actInfo.Dimension(1);
ResetHandle = @()myResetFunction();
StepHandle = @(DeltaTheta,LoggedSignals) myStepFunction(DeltaTheta,LoggedSignals);
env = rlFunctionEnv(obsInfo,actInfo,StepHandle,ResetHandle);
rng(0)
statePath = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(400,'Name','CriticStateFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(300,'Name','CriticStateFC2')];
actionPath = [
    featureInputLayer(numAcs,'Normalization','none','Name','action')
    fullyConnectedLayer(300,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
    additionLayer(2,'Name','add')
    reluLayer('Name','CriticCommonRelu')
    fullyConnectedLayer(1,'Name','CriticOutput')];
criticOpts = rlRepresentationOptions('LearnRate',1e-03,'GradientThreshold',1,'Optimizer',"adam",...
'L2RegularizationFactor',0.0001,'UseDevice',"gpu");
criticNetwork = layerGraph();
criticNetwork = addLayers(criticNetwork,statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
figure
plot(criticNetwork)
critic = rlQValueRepresentation(criticNetwork,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'},criticOpts);
actorNetwork = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(400,'Name','ActorFC1')
    reluLayer('Name','ActorRelu1')
    fullyConnectedLayer(300,'Name','ActorFC2')
    reluLayer('Name','ActorRelu2')
    fullyConnectedLayer(200,'Name','ActorFC3')
    reluLayer('Name','ActorRelu3')
    fullyConnectedLayer(100,'Name','ActorFC4')
    reluLayer('Name','ActorRelu4')
    fullyConnectedLayer(6,'Name','ActorFC5')
    tanhLayer('Name','ActorTanh')
    scalingLayer('Name','ActorScaling','Scale',actInfo.UpperLimit)];
actorOpts = rlRepresentationOptions('LearnRate',1e-04,'GradientThreshold',1,'Optimizer',"adam",...
'L2RegularizationFactor',0.0001,'UseDevice',"gpu");
actor = rlDeterministicActorRepresentation(actorNetwork,obsInfo,actInfo,'Observation',{'observation'},'Action',{'ActorScaling'},actorOpts);
% act = cell2mat(getAction(actor,{rand(6,1)}))
agentOpts = rlDDPGAgentOptions(...
'SampleTime',Ts,...
'TargetSmoothFactor',1e-3,...
'ExperienceBufferLength',1e6,...
'DiscountFactor',0.99,...
'SaveExperienceBufferWithAgent',true,...
'ResetExperienceBufferBeforeTraining',false,...
'MiniBatchSize',128);
agentOpts.NoiseOptions.Variance = 0.6;
agentOpts.NoiseOptions.VarianceDecayRate = 1e-5;
agent = rlDDPGAgent(actor,critic,agentOpts);
maxEpisode =5000;
maxStepsPerEpisode = ceil(Tf/Ts);
trainOpts = rlTrainingOptions(...
'MaxEpisodes',maxEpisode,...
'MaxStepsPerEpisode',maxStepsPerEpisode,...
'ScoreAveragingWindowLength',5,...
'Verbose',true,...
'Plots','training-progress',...
'StopTrainingCriteria','EpisodeReward',...
'StopTrainingValue',-100,...
'SaveAgentCriteria','EpisodeCount',...
'SaveAgentValue',3,...
'SaveAgentDirectory','C:\Users\Asus\Desktop\ROBOT CODE\Robot Agent');
% Train the agent.
trainingStats = train(agent,env,trainOpts);
function [InitialObservation, LoggedSignal] = myResetFunction()
% Reset function: place the robot in its home configuration.
global robot
Theta = homeConfiguration(robot);
LoggedSignal.State = Theta;
InitialObservation = LoggedSignal.State;
end
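% Note (assumption): homeConfiguration(robot) returns a numeric column vector
% only when robot.DataFormat is 'column' (or a row vector for 'row'); with the
% default 'struct' format it returns a struct array, which would not match the
% [6 1] observation spec. The robot here is assumed to use 'column'.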
function [NextObs,Reward,IsDone,LoggedSignals] = myStepFunction(DeltaTheta,LoggedSignals)
global robot
global T60d_StartPoint T60d_EndPoint
global worldCollisionArray
global theta_max theta_min thetaDot_max thetaDot_min
global a1 a2 a3 a4 a5 a6 d1 d2 d3 d4 d5 d6
x_tool=0.0395;
y_tool=0;
z_tool= 0.4333;
beta=pi/4;
Reward=0;
Ts = 0.05; % Sample time
% Clamp the commanded joint increment to the joint-rate limits
for i = 1:6
    if DeltaTheta(i) > Ts*thetaDot_max(i)
        DeltaTheta(i) = Ts*thetaDot_max(i);
    elseif DeltaTheta(i) < Ts*thetaDot_min(i)
        DeltaTheta(i) = Ts*thetaDot_min(i);
    end
end
Theta = LoggedSignals.State(1:6);
Previous_config = Theta;
Theta_nextStep = Theta + Ts*DeltaTheta;
% Clamp the next configuration to the joint position limits
for i = 1:6
    if Theta_nextStep(i) > theta_max(i)
        Theta_nextStep(i) = theta_max(i);
    elseif Theta_nextStep(i) < theta_min(i)
        Theta_nextStep(i) = theta_min(i);
    end
end
LoggedSignals.State = Theta_nextStep;
NextObs = LoggedSignals.State;
% Collision penalty
configuration = double(Theta_nextStep);
inCollision = checkCollision(robot,configuration,worldCollisionArray,'IgnoreSelfCollision','on');
if inCollision
    Reward = -10;
end
% Analytic forward kinematics: tool-tip position in the base frame (computed from Theta)
theta1 = Theta(1); theta2 = Theta(2); theta3 = Theta(3);
theta4 = Theta(4); theta5 = Theta(5); theta6 = Theta(6);
c1=cos(theta1);s1=sin(theta1);c2=cos(theta2);s2=sin(theta2);c3=cos(theta3);s3=sin(theta3);
c23=cos(theta2+theta3);s23=sin(theta2+theta3);c4=cos(theta4);s4=sin(theta4);
c5=cos(theta5); s5=sin(theta5);c6=cos(theta6); s6=sin(theta6);
EndeffectorTipFrame_Base.X =a1*c1 - d2*s1 - d6*(s5*(s1*s4 + c1*c4*c23) + c1*c5*s23) - z_tool*(s5*(s1*s4 + c1*c4*c23) + c1*c5*s23) + x_tool*(c6*(c5*(s1*s4 + c1*c4*c23) - c1*s5*s23) + s6*(c4*s1 - c1*c23*s4)) - y_tool*(s6*(c5*(s1*s4 + c1*c4*c23) - c1*s5*s23) - c6*(c4*s1 - c1*c23*s4)) + a2*c1*c2 - c1*d4*s23 - a3*c1*s2*s3 + a3*c1*c2*c3;
EndeffectorTipFrame_Base.Y =y_tool*(s6*(c5*(c1*s4 - c4*c23*s1) + s1*s5*s23) - c6*(c1*c4 + c23*s1*s4)) - x_tool*(c6*(c5*(c1*s4 - c4*c23*s1) + s1*s5*s23) + s6*(c1*c4 + c23*s1*s4)) + c1*d2 + a1*s1 + d6*(s5*(c1*s4 - c4*c23*s1) - c5*s1*s23) + z_tool*(s5*(c1*s4 - c4*c23*s1) - c5*s1*s23) + a2*c2*s1 - d4*s1*s23 - a3*s1*s2*s3 + a3*c2*c3*s1;
EndeffectorTipFrame_Base.Z =d1 - z_tool*(c5*c23 - c4*s5*s23) - c23*d4 - a2*s2 - a3*s23 - x_tool*(c6*(c23*s5 + c4*c5*s23) - s4*s6*s23) + y_tool*(s6*(c23*s5 + c4*c5*s23) + c6*s4*s23) - d6*(c5*c23 - c4*s5*s23);
goal.X = T60d_EndPoint(1,4);
goal.Y = T60d_EndPoint(2,4);
goal.Z = T60d_EndPoint(3,4);
distanceVector = [goal.X - EndeffectorTipFrame_Base.X ; ...
                  goal.Y - EndeffectorTipFrame_Base.Y ; ...
                  goal.Z - EndeffectorTipFrame_Base.Z];
error = norm(distanceVector);   % distance from tool tip to goal (note: the name shadows MATLAB's built-in error function)
% disp(error);
% disp(stepcounter);
disp(DeltaTheta);
hold on
% Endconfig = InversKinematic(T60d_EndPoint,Previous_config);
% configuration = wrapToPi(configuration);
% Endconfig = wrapToPi(Endconfig);
% error = abs(configuration-Endconfig);
Q = eye(6);            % quadratic-cost weights (currently unused in the reward)
R = (10^-3)*eye(6);
Reward = Reward - error;   % dense reward: negative distance to goal (plus any collision penalty)
% Check terminal condition
IsDone = 0;
if error < 10^-1
    disp('yes')
    IsDone = 1;
end
end
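Separately, a sanity check along these lines (a sketch using the standard rlFunctionEnv workflow; the random test action is only a placeholder within the action limits) confirms that whatever action is passed in does reach myStepFunction, independently of the agent:

% Sketch: exercise the custom environment directly, without the agent.
validateEnvironment(env);     % verifies the reset/step handles against the specs
reset(env);
testAction = actInfo.LowerLimit + ...
    (actInfo.UpperLimit - actInfo.LowerLimit).*rand(6,1);   % placeholder action within limits
[nextObs,reward,isDone] = step(env,testAction);
disp(nextObs)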

Answers (0)
