State dim march #33

Open · wants to merge 2 commits into base: devel

1 change: 1 addition & 0 deletions gym-kinova-gripper/.gitignore
@@ -1,6 +1,7 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*.pyc

# C extensions
*.so
43 changes: 35 additions & 8 deletions gym-kinova-gripper/DDPGfD.py
@@ -1,3 +1,6 @@
import os
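# CUDA_LAUNCH_BLOCKING=1 forces synchronous CUDA kernel launches so errors surface at the offending call (debugging aid; slows training)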
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import copy
import numpy as np
import torch
@@ -73,7 +76,7 @@ def select_action(self, state):
return self.actor(state).cpu().data.numpy().flatten()


def train(self, episode_step, expert_replay_buffer, replay_buffer=None, prob=0.7):
def train(self, episode_step, expert_replay_buffer, replay_buffer=None, prob=0.7, mod_state_idx=np.arange(82)):
""" Update policy based on full trajectory of one episode """
self.total_it += 1

@@ -110,6 +113,15 @@ def train(self, episode_step, expert_replay_buffer, replay_buffer=None, prob=0.7
print("IN OG TRAIN: lift_reward_count: ", lift_reward_count)
"""

print("single batch...")
print('state dimensions: ', state.shape)
print('next state dimensions: ', next_state.shape)
# modify state dimensions
state = state[:, mod_state_idx]
next_state = next_state[:, mod_state_idx]
print('state dimensions: ', state.shape)
print('next state dimensions: ', next_state.shape)

# Target Q network
#print("Target Q")
target_Q = self.critic_target(next_state, self.actor_target(next_state))
@@ -215,7 +227,7 @@ def train(self, episode_step, expert_replay_buffer, replay_buffer=None, prob=0.7
return actor_loss.item(), critic_loss.item(), critic_L1loss.item(), critic_LNloss.item()


def train_batch(self, episode_step, expert_replay_buffer, replay_buffer, num_trajectories, prob=0.3):
def train_batch(self, episode_step, expert_replay_buffer, replay_buffer, num_trajectories, prob=0.3, mod_state_idx=np.arange(82)):
""" Update policy networks based on batch_size of episodes using n-step returns """
self.total_it += 1

@@ -239,22 +251,37 @@ def train_batch(self, episode_step, expert_replay_buffer, replay_buffer, num_tra
#print("SAMPLING FROM EXPERT...expert_batch_size: ",expert_batch_size)
expert_state, expert_action, expert_next_state, expert_reward, expert_not_done = expert_replay_buffer.sample_batch_nstep(expert_batch_size,num_trajectories)

print("what's going on here")
print(agent_state.shape)
print(expert_state.shape)

# Concatenate batches of agent and expert experience to get batch_size tensors of experience
state = torch.cat((torch.squeeze(agent_state), torch.squeeze(expert_state)), 0)
action = torch.cat((torch.squeeze(agent_action), torch.squeeze(expert_action)), 0)
next_state = torch.cat((torch.squeeze(agent_next_state), torch.squeeze(expert_next_state)), 0)
reward = torch.cat((torch.squeeze(agent_reward), torch.squeeze(expert_reward)), 0)
not_done = torch.cat((torch.squeeze(agent_not_done), torch.squeeze(expert_not_done)), 0)
if self.batch_size == 1:
state = state.unsqueeze(0)
action = action.unsqueeze(0)
next_state = next_state.unsqueeze(0)
reward = reward.unsqueeze(0)
not_done = not_done.unsqueeze(0)
# if self.batch_size == 1:
# state = state.unsqueeze(0)
# action = action.unsqueeze(0)
# next_state = next_state.unsqueeze(0)
# reward = reward.unsqueeze(0)
# not_done = not_done.unsqueeze(0)

# print(state.shape)
# print("okay done")

reward = reward.unsqueeze(-1)
not_done = not_done.unsqueeze(-1)

print('state dimensions: ', state.shape)
print('next state dimensions: ', next_state.shape)
# Reduce the state tensors to the selected subset of state indices (mod_state_idx)
state = state[:, :, mod_state_idx]
next_state = next_state[:, :, mod_state_idx]
print('modified state dimensions: ', state.shape)
print('modified next state dimensions: ', next_state.shape)

### FOR TESTING:
#assert_batch_size = self.batch_size * num_trajectories
num_timesteps_sampled = len(reward)
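For reference, the new `mod_state_idx` slicing above simply selects a subset of state features along the last axis; below is a minimal standalone sketch, with made-up batch/step sizes and an index set mirroring the 'nigel_norangefinder' option defined in `main_DDPGfD.py`:

```python
import numpy as np
import torch

batch_size, n_steps, full_state_dim = 4, 5, 82
# Reduced index set: object position + finger-object distances + object size
mod_state_idx = np.concatenate((np.arange(21, 24),
                                np.arange(36, 48),
                                np.arange(33, 36)))

# 2D case (train): [batch, state_dim] -> [batch, len(mod_state_idx)]
state_2d = torch.randn(batch_size, full_state_dim)
print(state_2d[:, mod_state_idx].shape)     # torch.Size([4, 18])

# 3D case (train_batch): [batch, n_steps, state_dim] -> [batch, n_steps, len(mod_state_idx)]
state_3d = torch.randn(batch_size, n_steps, full_state_dim)
print(state_3d[:, :, mod_state_idx].shape)  # torch.Size([4, 5, 18])
```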
86 changes: 80 additions & 6 deletions gym-kinova-gripper/main_DDPGfD.py
@@ -203,7 +203,11 @@ def eval_policy(policy, env_name, seed, requested_shapes, requested_orientation,
#####
# Not ready for lift, continue agent grasping following the policy
if not ready_for_lift:
action = policy.select_action(np.array(state[0:82]))
# Select the action using only the chosen subset of state indices
# old_state_idx = np.array(state[0:82])

action = policy.select_action(np.array(state)[state_idx_arr])
eval_env.set_with_grasp_reward(args.with_grasp_reward)
next_state, reward, done, info = eval_env.step(action)
cumulative_reward += reward
@@ -432,8 +436,11 @@ def update_policy(evaluations, episode_num, num_episodes, num_trajectories, prob

# Follow policy until ready for lifting, then switch to set controller
if not ready_for_lift:
# Select the action using only the chosen subset of state indices
print("taking action, reduced state shape:", np.array(state)[state_idx_arr].shape)
action = (
policy.select_action(np.array(state))
policy.select_action(np.array(state)[state_idx_arr])
+ np.random.normal(0, max_action * args.expl_noise, size=action_dim)
).clip(-max_action, max_action)
# Perform action obs, total_reward, done, info
@@ -474,12 +481,14 @@
# Single episode training using full trajectory
actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train(env._max_episode_steps,
expert_replay_buffer,
replay_buffer, prob)
replay_buffer, prob,
mod_state_idx=state_idx_arr)
else:
# Batch training using n-steps
actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train_batch(env._max_episode_steps,
expert_replay_buffer,
replay_buffer, num_trajectories, prob)
replay_buffer, num_trajectories, prob,
mod_state_idx=state_idx_arr)

# Evaluation and recording data for tensorboard
if episode_num+1 == num_episodes or (episode_num > args.update_after and (episode_num) % args.eval_freq == 0):
@@ -908,6 +917,9 @@ def create_info_file(num_success,num_total,all_saving_dirs,extra_text=""):
parser.add_argument("--exp_num", default=None, type=int) # RL Paper: experiment number
parser.add_argument("--num_traj", default=5, type=int) # Number of trajectories to sample per episode in train_batch sampling

parser.add_argument("--state_range", default='all',
type=str) # string - from ('all', 'nigel_rangefinder', 'nigel_norangefinder', 'all_real')

args = parser.parse_args()

""" Setup the environment, state, and action space """
@@ -928,8 +940,70 @@ def create_info_file(num_success,num_total,all_saving_dirs,extra_text=""):
torch.manual_seed(args.seed)
np.random.seed(args.seed)


# for reference: state space correspondence
'''
Local obs, all in local coordinates (from the center of the palm)
(18,) Finger Pos 0-18
(3,) Wrist Pos 18-21
(3,) Obj Pos 21-24
(9,) Joint States 24-33
(3,) Obj Size 33-36
(12,) Finger Object Distance 36-48
(2,) X and Z angle 48-50
(17,) Rangefinder data 50-67
(3,) Gravity vector in local coordinates 67-70
(3,) Object location based on rangefinder data 70-73
(1,) Ratio of the area of the side of the shape to the open portion of the side of the hand 73
(1,) Ratio of the area of the top of the shape to the open portion of the top of the hand 74
(6,) Finger dot products ("f1_prox", "f2_prox", "f3_prox", "f1_dist", "f2_dist", "f3_dist") 75-80
(1, ) Dot product (wrist) 81

Global obs, all in global coordinates (from simulator 0,0,0)
(18,) Finger Pos 0-18
(3,) Wrist Pos 18-21
(3,) Obj Pos 21-24
(9,) Joint States 24-33
(3,) Obj Size 33-36
(12,) Finger Object Distance 36-48
(2,) X and Z angle 48-50
(17,) Rangefinder data 50-67
'''

finger_pos_idx = np.arange(0, 18)
wrist_pos_idx = np.arange(18, 21)
obj_pos_idx = np.arange(21, 24)
joint_states_idx = np.arange(24, 33)
obj_size_idx = np.arange(33, 36)
finger_obj_dist_idx = np.arange(36, 48)
x_z_angle_idx = np.arange(48, 50)
rangefinder_data_idx = np.arange(50, 67)
gravity_vector_in_local_coords = np.arange(67, 70)
object_location_rangefinder = np.arange(70, 73)
ratio_sideshape_sidehand = np.array([73])
ratio_topshape_tophand = np.array([74])
f1_prox_idx = np.array([75])
f2_prox_idx = np.array([76])
f3_prox_idx = np.array([77])
f1_dist_idx = np.array([78])
f2_dist_idx = np.array([79])
f3_dist_idx = np.array([80])
dot_prod_wrist = np.array([81])

# Build the index array (state_idx_arr) for the requested state-space subset
assert args.state_range in ['all', 'nigel_rangefinder', 'nigel_norangefinder', 'all_real']
if args.state_range == 'all':
state_idx_arr = np.arange(0, 82)
if args.state_range == 'nigel_rangefinder':
state_idx_arr = np.concatenate((obj_pos_idx, rangefinder_data_idx, obj_size_idx), axis=0)
if args.state_range == 'nigel_norangefinder':
state_idx_arr = np.concatenate((obj_pos_idx, finger_obj_dist_idx, obj_size_idx), axis=0)
if args.state_range == 'all_real':
state_idx_arr = np.concatenate((obj_pos_idx, joint_states_idx, obj_size_idx, finger_obj_dist_idx, x_z_angle_idx))

# Set dimensions for state and action spaces - policy initialization
state_dim = 82 # State dimension dependent on the length of the state space
state_dim = 82 # base (full) state dimension, used for the replay buffer
modified_state_dim = len(state_idx_arr) # State dimension that is dependent on the length of the state space
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
max_action_trained = env.action_space.high # a vector of max actions
@@ -960,7 +1034,7 @@ def create_info_file(num_success,num_total,all_saving_dirs,extra_text=""):
env.Generate_Latin_Square(args.max_episode,"objects.csv", shape_keys=requested_shapes)

kwargs = {
"state_dim": state_dim,
"state_dim": modified_state_dim,
"action_dim": action_dim,
"max_action": max_action,
"n": n,
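As a quick sanity check on what each `--state_range` option implies for the policy's input size (`modified_state_dim`), here is a small standalone sketch using the same index ranges defined in the diff above; the printed dimensions follow directly from those ranges:

```python
import numpy as np

obj_pos_idx = np.arange(21, 24)
joint_states_idx = np.arange(24, 33)
obj_size_idx = np.arange(33, 36)
finger_obj_dist_idx = np.arange(36, 48)
x_z_angle_idx = np.arange(48, 50)
rangefinder_data_idx = np.arange(50, 67)

state_range_options = {
    'all': np.arange(0, 82),
    'nigel_rangefinder': np.concatenate((obj_pos_idx, rangefinder_data_idx, obj_size_idx)),
    'nigel_norangefinder': np.concatenate((obj_pos_idx, finger_obj_dist_idx, obj_size_idx)),
    'all_real': np.concatenate((obj_pos_idx, joint_states_idx, obj_size_idx,
                                finger_obj_dist_idx, x_z_angle_idx)),
}

for name, idx in state_range_options.items():
    print(f"--state_range {name}: modified_state_dim = {len(idx)}")

# --state_range all: modified_state_dim = 82
# --state_range nigel_rangefinder: modified_state_dim = 23
# --state_range nigel_norangefinder: modified_state_dim = 18
# --state_range all_real: modified_state_dim = 29
```

Since the replay buffer keeps the full 82-dimensional state (`state_dim = 82`) and the slicing happens at training and action-selection time, the same buffers appear to be reusable across `--state_range` settings, with only the policy's input size (`modified_state_dim`) changing.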