diff --git a/robosuite/demos/demo_random_action.py b/robosuite/demos/demo_random_action.py
index fc7aebe4d3..e8189254ff 100644
--- a/robosuite/demos/demo_random_action.py
+++ b/robosuite/demos/demo_random_action.py
@@ -1,6 +1,8 @@
 from robosuite.controllers import load_controller_config
 from robosuite.utils.input_utils import *
 
+from robosuite.utils.visual.VisualManager import VisualManager
+
 
 if __name__ == "__main__":
@@ -49,20 +51,100 @@
     # initialize the task
     env = suite.make(
         **options,
-        has_renderer=True,
-        has_offscreen_renderer=False,
+        has_renderer=False,
+        has_offscreen_renderer=True,
         ignore_done=True,
-        use_camera_obs=False,
+        use_camera_obs=True,
         control_freq=20,
+        # camera_names = ['agentview', 'agentview2']
     )
     env.reset()
-    env.viewer.set_camera(camera_id=0)
+    # env.set_camera(camera_id=0)
 
     # Get action limits
     low, high = env.action_spec
 
+    eyes = VisualManager(
+        MODEL_ROOT='/home/dizzyi/GNN/detectron/tutorial/output',
+        # The directory of the model
+
+        DATA_ROOT='./imagesave',       # The directory in which images and data are saved
+
+        verbose=True,                  # verbose
+
+        train_schedule=(10_000,),      # the trainer tunes the model whenever the number of saved images hits a value listed here
+
+        preprocessor_kwarg=dict(
+            mask_size=(128, 128),      # size that every mask/image will be warped to
+            grayscale=True,            # keep grayscale masks for more information
+            threshold=0.5,             # threshold on the confidence score
+            backbone=None,             # backbone for image and masks
+            getVec=None,               # maps the feature map to a feature vector
+            norm=None,                 # norm layer for image and masks
+            acti=None,                 # activation layer for image and masks
+        ),
+        imagesaver_kwarg=dict(
+            save_mode=True,            # True to turn on image-saving mode
+            save_freq=5,               # how often images and annotations are saved
+        ),
+        trainer_kwarg=dict(
+            NUM_CLASSES=20,            # number of classes to classify
+            train_mode=True,           # True to turn on training mode
+            NEW_MODEL_ROOT='./new_model',
+            # The directory in which every newly tuned model will be saved
+        ),
+    )
+
+    from PIL import Image
+    import time
 
     # do visualization
     for i in range(1000):
+
+        print("--------------------------------------")
+        # delta = time.time()
         action = np.random.uniform(low, high)
         obs, reward, done, _ = env.step(action)
-        env.render()
+
+        feature_vectors = eyes(obs['agentview_image'], env)
+
+        '''
+        img = Image.fromarray(img).rotate(180)
+        segment = Image.fromarray(seg).rotate(180)
+
+        objects = {}
+        for i in seg.reshape(-1, 3):
+            name = env.sim.model.geom_id2name(i[1])
+            objects[i[1]] = name
+
+        for k, v in sorted(objects.items()): print(k, v.split("_") if v is not None else v)
+
+        # objects[id] => name
+        # ids: list of M ids
+        ids = np.unique(seg)
+        # ids = np.array(list(filter(lambda id: objects[id] != None, ids)))
+
+        # mask: (256, 256, 1) containing the M ids
+        _, mask, _ = np.split(seg, 3, axis=2)
+
+        # mask[np.newaxis]                           ==> (1, 256, 256, 1)
+        # ids[:, np.newaxis, np.newaxis, np.newaxis] ==> (M, 1, 1, 1)
+        #   L broadcastable
+
+        # masks ==> (M, 256, 256, 1)
+        masks = (mask[np.newaxis] == ids[:, np.newaxis, np.newaxis, np.newaxis]).squeeze().astype(np.uint8)
+        # masks = np.array(list(filter(lambda m: m.sum() > 100, masks)))  # outdated
+        masks = masks * 255
+
+        img.save('./image.png')
+        segment.save('./segment.png')
+
+        for ind, msk in enumerate(masks):
+            seg_png = Image.fromarray(msk, mode='L').rotate(180)
+            seg_png.save(f'./seg/{ids[ind]}-{objects[ids[ind]]}.png')
+            print(ind)
+        '''
+        # print(time.time() - delta)
+        # env.render()
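+
+        # Illustrative only (not part of the original demo logic): what the returned
+        # embeddings look like, assuming the default getVec head, which outputs one
+        # 64-dim vector per detected object:
+        #
+        #     if feature_vectors.shape[-1] == 1:
+        #         print("no instances detected in this frame")
+        #     else:
+        #         print(feature_vectors.shape)             # torch.Size([N, 64])
+        #         vecs = feature_vectors.detach().numpy()  # hand off to a policy, logger, ...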
diff --git a/robosuite/environments/robot_env.py b/robosuite/environments/robot_env.py
index 2b3f77b41b..98594e9c06 100644
--- a/robosuite/environments/robot_env.py
+++ b/robosuite/environments/robot_env.py
@@ -361,12 +361,31 @@ def camera_rgb(obs_cache):
                 height=cam_h,
                 depth=cam_d,
             )
+            ######################################################################################
+            # Added to also render the segmentation image
+            img_seg = self.sim.render(
+                camera_name=cam_name,
+                width=cam_w,
+                height=cam_h,
+                depth=cam_d,
+                segmentation=True
+            )
+            img_seg = np.concatenate((img_seg, np.zeros((cam_h, cam_w, 1), dtype=np.uint8)), axis=2)
+            ######################################################################################
             if cam_d:
                 rgb, depth = img
                 obs_cache[depth_sensor_name] = np.expand_dims(depth[::convention], axis=-1)
-                return rgb[::convention]
+
+                ##################################################################################
+
+                return (rgb[::convention], img_seg[::convention])
+                ##################################################################################
             else:
-                return img[::convention]
+
+                ##################################################################################
+                # Modified to return (2, cam_h, cam_w, 3): (rgb, seg)
+                return (img[::convention], img_seg[::convention])
+                ##################################################################################
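+                ##################################################################################
+                # Illustrative note (not required by the change itself): with this sensor each
+                # camera observation arrives as a stacked (2, cam_h, cam_w, 3) array, e.g.
+                #
+                #     rgb, seg = np.asarray(obs[f"{cam_name}_image"], dtype=np.uint8)
+                #     geom_ids = np.unique(seg[..., 1])   # MuJoCo geom ids live in channel 1
+                #
+                # which matches how VisualManager.__call__ unpacks it.
+                ##################################################################################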
 
         sensors.append(camera_rgb)
         names.append(rgb_sensor_name)
diff --git a/robosuite/utils/visual/Trainer.py b/robosuite/utils/visual/Trainer.py
new file mode 100644
index 0000000000..2f513617bd
--- /dev/null
+++ b/robosuite/utils/visual/Trainer.py
@@ -0,0 +1,121 @@
+import torch
+
+from detectron2.utils.logger import setup_logger
+setup_logger()
+
+# import some common libraries
+import numpy as np
+import os, json, pickle
+
+# import some common detectron2 utilities
+from detectron2.data.datasets import register_coco_instances
+from detectron2.engine import DefaultTrainer
+from detectron2.data import DatasetCatalog
+
+
+class Trainer():
+    def __init__(
+        self,
+        NUM_CLASSES = 20,
+        train_mode = False,
+        DATA_ROOT = None,
+        MODEL_ROOT = None,
+        NEW_MODEL_ROOT = None
+    ):
+        self.train_mode = train_mode
+        self.DATA_ROOT = DATA_ROOT
+        self.MODEL_ROOT = [MODEL_ROOT]
+        self.NEW_MODEL_ROOT = NEW_MODEL_ROOT
+        self.NUM_CLASSES = NUM_CLASSES
+        if self.train_mode:
+            assert MODEL_ROOT is not None, "Need to provide MODEL_ROOT"
+            assert DATA_ROOT is not None, "Need to provide DATA_ROOT"
+            assert NEW_MODEL_ROOT is not None, "Need to provide NEW_MODEL_ROOT"
+
+
+    def train(self, sche, hyperparam_kwarg = None):
+        self.current_dir = sche
+        self.set_hyperparam(**(hyperparam_kwarg or {}))
+
+        trainer = DefaultTrainer(self.cfg)
+        trainer.resume_or_load(resume=False)
+        trainer.train()
+
+        with open(os.path.join(self.NEW_MODEL_ROOT, sche, 'model_cfg.pickle'), 'wb') as f:
+            pickle.dump(self.cfg, f)
+
+        self.MODEL_ROOT.append(os.path.join(self.NEW_MODEL_ROOT, self.current_dir))
+
+
+    def get_current_root(self):
+        return self.MODEL_ROOT[-1]
+
+    def set_hyperparam(self):
+
+        def VisualManager_Trainer_dataset_function():
+            returnList = []
+            for file in sorted(os.listdir(self.DATA_ROOT)):
+                if not file.endswith('.pickle'): continue
+
+                f_path = os.path.join(self.DATA_ROOT, file)
+
+                with open(f_path, 'rb') as f:
+                    returnList.append(pickle.load(f))
+            return returnList
+
+        DatasetCatalog.register('VisualManager_Trainer_Dataset', VisualManager_Trainer_dataset_function)
+
+
+        with open(os.path.join(self.MODEL_ROOT[-1], 'model_cfg.pickle'), 'rb') as f:
+            self.cfg = pickle.load(f)
+
+        self.cfg.DATASETS.TRAIN = ('VisualManager_Trainer_Dataset',)
+
+        self.cfg.MODEL.WEIGHTS = os.path.join(self.MODEL_ROOT[-1], "model_final.pth")
+
+        # Detectron default: 4
+        self.cfg.DATALOADER.NUM_WORKERS = 4
+        # Detectron default: 40000
+        self.cfg.SOLVER.MAX_ITER = 120_000
+        '''
+        Detectron default
+            Base learning rate 0.001
+            GAMMA 0.1
+            STEPS (30000,)
+            GAMMA : learning-rate decay factor
+            STEPS : iteration counts at which the learning rate decays by GAMMA
+
+        MASK R-CNN PAPER : https://arxiv.org/pdf/1703.06870.pdf
+            Base LR 0.02
+            decay by 10 @ 120k/160k
+
+        Cityscapes finetuning
+            Base LR 0.001
+            decay by 10 @ 18k/24k
+
+        updated baselines
+            Base LR 0.001
+            decay by 10 @ 120k,160k/180k
+
+        Benefit from deeper model
+        '''
+        self.cfg.SOLVER.BASE_LR = 0.001
+        self.cfg.SOLVER.GAMMA = 0.1
+        self.cfg.SOLVER.STEPS = (90_000,)
+        self.cfg.SOLVER.WEIGHT_DECAY = 0.000_1
+
+
+        # ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+        # E.g., a common configuration is: 512 * 16 = 8192
+        # Detectron default: 16
+        self.cfg.SOLVER.IMS_PER_BATCH = 32
+        # Detectron default: 512
+        self.cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 2048
+
+        # Number of classes
+        self.cfg.MODEL.ROI_HEADS.NUM_CLASSES = self.NUM_CLASSES
+
+        # Confidence level
+        self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # set a custom testing threshold
+
+        self.cfg.OUTPUT_DIR = os.path.join(self.NEW_MODEL_ROOT, self.current_dir)
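+
+# Illustrative usage sketch (not exercised by this patch); the paths are placeholders:
+#
+#     trainer = Trainer(NUM_CLASSES=20, train_mode=True,
+#                       DATA_ROOT='./imagesave',
+#                       MODEL_ROOT='/path/to/base_model',
+#                       NEW_MODEL_ROOT='./new_model')
+#     trainer.train(sche='tune_model_10000', hyperparam_kwarg={})
+#     new_root = trainer.get_current_root()   # directory of the freshly tuned model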
+
diff --git a/robosuite/utils/visual/VisualManager.py b/robosuite/utils/visual/VisualManager.py
new file mode 100644
index 0000000000..c520a584ec
--- /dev/null
+++ b/robosuite/utils/visual/VisualManager.py
@@ -0,0 +1,478 @@
+################################################################
+'''
+README
+
+# VisualManager
+    The VisualManager is a class which handles all visual-related tasks.
+    It is composed of:
+    - Preprocessor
+        extracts each object's feature vector from an image input
+    - ImageSaver
+        annotates and saves data for further training
+    - Trainer
+        tunes a new model from the old model on the data that we generated
+
+
+    # EXAMPLE
+    # constructor
+
+    ----------------------------------------------------------------------------------------------------------------------------
+    eyes = VisualManager(
+        MODEL_ROOT = path,           # The directory of the model
+
+        DATA_ROOT = path,            # The directory in which images and data are saved
+
+        verbose = True,              # verbose
+
+        train_schedule = (10_000,),  # The trainer tunes the model whenever the number of saved images hits a value listed here
+
+        preprocessor_kwarg = dict(
+            mask_size = (128,128),   # size that every mask/image will be warped to
+            grayscale = True,        # keep grayscale masks for more information
+            threshold = 0.5,         # threshold on the confidence score
+            backbone = None,         # backbone for image and masks
+            getVec = None,           # maps the feature map to a feature vector
+            norm = None,             # norm layer for image and masks
+            acti = None,             # activation layer for image and masks
+        ),
+        imagesaver_kwarg = dict(
+            save_mode = True,        # True to turn on image-saving mode
+            save_freq = 100          # how often images and annotations are saved
+        ),
+        trainer_kwarg = dict(
+            NUM_CLASSES = 20,        # number of classes to classify
+            train_mode = True,       # True to turn on training mode
+            NEW_MODEL_ROOT = path,   # The directory in which all newly tuned models will be saved
+        )
+    )
+    ----------------------------------------------------------------------------------------------------------------------------
+
+    # use
+    feature_vectors = eyes(obs['agentview_image'], env)
+    ! obs can come from any other camera name
+    ! env must be passed for annotation purposes
+
+
+# Preprocessor
+    # constructor
+    preprocessor = Preprocessor(
+        MODEL_ROOT = '{MODEL_ROOT}',   <= the root directory of the model
+        mask_size = (128,128),         <= the size every mask will be resized to
+        grayscale = True               <= whether the image will be grayscaled
+    )
+
+    # use
+    preprocessed_feature_vector = preprocessor( img )
+    # input
+    #   img should have shape => ( Height, Width, Channel )
+    # output
+    #   (N, embedded size)
+
+
+'''
+################################################################
+"""
+MODEL_ROOT
+    L model_cfg.pickle    <==
+    L model_final.pth
+"""
+"""
+IMAGE_SAVE_DIR
+    L {id}.pickle         <== pairs of pickle and png files
+    L {id}.png
+"""
+
+
+import os
+import pickle
+from PIL import Image, ImageFilter
+import time
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from pycocotools.mask import encode as Mask2RLE
+
+from detectron2.engine import DefaultPredictor
+from detectron2.structures import BoxMode
+
+from robosuite.utils.visual.Trainer import Trainer
+
+
+class Preprocessor(nn.Module):
+    def __init__(
+        self,
+        MODEL_ROOT = None,
+        mask_size = (128,128),
+        grayscale = True,
+        threshold = 0.5,
+        backbone = None,
+        getVec = None,
+        norm = None,
+        acti = None
+    ):
+        super(Preprocessor, self).__init__()
+        assert MODEL_ROOT is not None
+        # Load the config and weights of the model and construct the predictor
+        self.threshold = threshold
+        self.load_model(MODEL_ROOT)
+
+        self.mask_size = mask_size
+        self.format = 'L' if grayscale else "1"
+
+        ##############################################################################
+        # Learnable network
+        ##############################################################################
+        self.backbone = nn.Sequential(
+            nn.Conv2d(4, 16, 3, 1, 1),
+            nn.Conv2d(16, 16, 3, 1, 1),
+            nn.MaxPool2d(2, stride = 2),   # size shrinks by half
+            nn.ReLU(),
+
+            nn.Conv2d(16, 16, 3, 1, 1),
+            nn.Conv2d(16, 16, 3, 1, 1),
+            nn.MaxPool2d(2, stride = 2),   # size shrinks by half
+            nn.ReLU(),
+
+            nn.Conv2d(16, 16, 3, 1, 1),
+            nn.Conv2d(16, 1, 3, 1, 1),
+            nn.MaxPool2d(2, stride = 2),   # size shrinks by half
+            nn.ReLU()
+        ) if backbone is None else backbone
+
+        self.getVec = nn.Sequential(
+            nn.Linear( int(mask_size[0]/8 * mask_size[1]/8) + 6, 128),
+            nn.Linear( 128, 128),
+            nn.Linear( 128, 64),
+        ) if getVec is None else getVec
+
+        self.norm = nn.LayerNorm(self.mask_size) if norm is None else norm
+        self.acti = nn.Tanh() if acti is None else acti
+        ##############################################################################
+        self.testdrive()
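+
+        # Note (illustrative, assuming the default layers above): with mask_size = (128, 128)
+        # the three MaxPool2d layers shrink each 4-channel mask+image stack to 16 x 16, so the
+        # flattened feature map has 16 * 16 = 256 entries and getVec expects 256 + 6 = 262
+        # inputs (the extra 6 values are x1, y1, x2, y2, class id and score).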
+
+
+    def forward(self, img):
+        # make sure the input shape is ( Height, Width, Channel )
+        assert len(img.shape) == 3, "ERROR: The input is not of shape ( Height, Width, Channel ): input does not have 3 dimensions"
+        assert img.shape[2] == 3, "ERROR: The input is not of shape ( Height, Width, Channel ): input does not have 3 channels"
+
+        img = np.array(img)
+
+
+        instances = self.predictor(img)["instances"]
+        N = len(instances)
+
+        if N == 0: return torch.tensor([[0]])
+
+        '''
+        instances.pred_boxes
+            Boxes object storing N boxes
+            instances.pred_boxes.tensor returns an (N, 4) matrix
+        instances.pred_classes  shape: (N)
+        instances.pred_masks    shape: (N, H, W)
+        instances.scores        shape: (N)
+
+        img                     shape: (H, W, C)
+        '''
+        info = torch.cat(
+            (
+                instances.pred_boxes.tensor,
+                instances.pred_classes.unsqueeze(1),
+                instances.scores.unsqueeze(1)
+            ), dim = 1)
+
+        masks = [
+            np.asarray(
+                Image.fromarray(
+                    m.detach().numpy()
+                ).convert( self.format ).resize( self.mask_size )
+            )
+            for m in instances.pred_masks
+        ]
+        masks = torch.tensor( np.asarray(masks), dtype = torch.float).unsqueeze(1)
+
+        image = torch.tensor(
+            np.asarray(Image.fromarray(img).resize( self.mask_size )),
+            dtype=torch.float32
+        ).permute((2,0,1))
+        image = self.acti(self.norm(image.repeat(N,1,1,1)))
+        '''
+        N      : number of instances identified in the image
+        HS, WS : pre-defined size of the resized mask, default (128, 128)
+        info   : tensor shape: (N, 6)  <- the six dims are: (x1, y1, x2, y2, class_id, score)
+        masks  : tensor shape: (N, 1, HS, WS)
+        image  : tensor shape: (N, 3, HS, WS)
+        imgnseg: tensor shape: (N, 4, HS, WS)
+        '''
+
+        imgnseg = torch.cat((masks, image), dim = 1)
+
+        assert imgnseg.shape == (N, 4, *self.mask_size)
+        assert info.shape == (N, 6)
+
+        feature_maps = self.backbone(imgnseg).reshape((N, -1))
+        '''
+        feature_maps
+            tensor shape: (N, HS/8 * WS/8)
+        '''
+
+        vector = torch.cat( (feature_maps, info), dim=1)
+
+        return self.getVec(vector)
+
+    def testdrive(self):
+        N = 12
+        H, W = self.mask_size
+
+        with torch.no_grad():
+
+            test_info = torch.rand(N, 6)
+            test_masks = torch.rand(N, 1, H, W)
+            test_image = torch.rand(H, W, 3).permute((2,0,1))
+
+            try:
+                test_masks = self.acti(test_masks)
+            except:
+                raise Exception("The specified acti layer is not compatible")
+
+            try:
+                test_image = self.norm(test_image.repeat(N,1,1,1))
+            except:
+                raise Exception("The specified norm layer is not compatible")
+
+
+            test_imgnseg = torch.cat((test_masks, test_image), dim=1)
+
+            try:
+                test_map = self.backbone( test_imgnseg ).reshape((N,-1))
+            except:
+                raise Exception("The specified backbone layer is not compatible")
+            vector = torch.cat( (test_map, test_info), dim=1)
+
+            try:
+                vec = self.getVec( vector )
+            except:
+                raise Exception("The specified getVec is not compatible")
+            assert len(vec.shape) == 2
+
+    def load_model(self, MODEL_ROOT):
+        self.MODEL_ROOT = MODEL_ROOT
+        with open(os.path.join(self.MODEL_ROOT, 'model_cfg.pickle'), 'rb') as f:
+            cfg = pickle.load(f)
+
+        cfg.MODEL.WEIGHTS = os.path.join(self.MODEL_ROOT, "model_final.pth")  # path to the model we just trained
+        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = self.threshold                # set a custom testing threshold
+
+        self.predictor = DefaultPredictor(cfg)
+
+
+class ImageSaver():
+    def __init__(
+        self,
+        save_mode = False,
+        save_freq = None,
+        DATA_ROOT = None,
+    ):
+        self.save_mode = save_mode
+        if self.save_mode is True:
+            assert save_freq is not None, "save mode is on but the save frequency is not provided, try save_mode = False, or save_freq = 1_000"
+            assert DATA_ROOT is not None, "save mode is on but the image directory is not provided, try save_mode = False, or provide a directory to save the data"
+        self.DATA_ROOT = DATA_ROOT
+        self.save_freq = save_freq
+        self.counter = 0
+
+
+    def __call__(self, img, seg, env):
+        if self.save_mode is False:
+            return None
+        if self.counter == 0:
+            self.save(img.astype(np.uint8), seg, env)
+        self.counter = (self.counter + 1) % self.save_freq
+        return self.counter == 1
+
+    def pre_save(self):
+        if not os.path.isdir(self.DATA_ROOT):
+            os.mkdir(self.DATA_ROOT)
+
+    def save(self, img, seg, env):
+        self.pre_save()
+
+        id = int(time.time())
+        while f'{id}.png' in os.listdir(self.DATA_ROOT):
+            id = int(time.time())
+
+        with open(os.path.join(self.DATA_ROOT, f'{id}.pickle'), 'wb') as f:
+            pickle.dump( self.seg2anno(seg, env, id), f)
+        Image.fromarray(img).save(os.path.join(self.DATA_ROOT, f'{id}.png'))
+
+        # raise Exception("This function is not finished yet")
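+
+    # Illustrative note: each saved {id}.pickle holds one detectron2-style record, roughly
+    #
+    #     {'file_name': '<DATA_ROOT>/<id>.png', 'height': H, 'width': W, 'image_id': id,
+    #      'annotations': [{'bbox': [...], 'bbox_mode': BoxMode.XYXY_ABS,
+    #                       'category_id': ..., 'segmentation': <COCO RLE>}, ...]}
+    #
+    # which is the format the Trainer's dataset function later loads and registers with
+    # DatasetCatalog.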
+
+    def seg2anno(self, seg, env, img_id):
+        # objects: a dictionary [id] => name
+        # ids: list of M ids
+        returnDict = {}
+        returnDict['file_name'] = os.path.join(self.DATA_ROOT, f'{img_id}.png')
+        returnDict['height'] = len(seg)
+        returnDict['width'] = len(seg[0])
+        returnDict['image_id'] = img_id
+        returnDict['annotations'] = []
+
+
+        # mask: (256, 256, 1) containing the M ids
+        _, mask, _ = np.split(seg, 3, axis=2)
+
+        objects = {}
+        ids = np.unique(mask)
+        for id in ids:
+            name = env.sim.model.geom_id2name(id)
+            objects[id] = name
+
+        for k, v in sorted(objects.items()): print(k, v.split("_") if v is not None else v)
+
+        # mask.squeeze()[np.newaxis]     ==> (1, 256, 256)
+        # ids[:, np.newaxis, np.newaxis] ==> (M, 1, 1)
+        #   L broadcastable to (M, 256, 256)
+        # masks ==> (M, 256, 256)
+        masks = (mask.squeeze()[np.newaxis] == ids[:, np.newaxis, np.newaxis]).astype(np.uint8)
+        # masks = np.asarray(masks)
+
+        # ids     : list of ids
+        # objects : map id => name
+        # masks   : per-id binary masks
+        have_name = False
+        no_name_counter = 0
+        name2idMask = {}
+        for _id, _mask in zip(ids, masks):
+            _name = objects[_id]
+            if _name is not None:
+                _name = _name.split('_')[0]
+            if _name is None and have_name:
+                have_name = False
+                no_name_counter += 1
+            if _name is not None and not have_name:
+                have_name = True
+
+            if _name is None:
+                _name = f'bin{no_name_counter}'
+            if _name in name2idMask:
+                old_mask, old_id = name2idMask[_name]
+                name2idMask[_name] = (old_mask + _mask, old_id + [_id])
+            else:
+                name2idMask[_name] = (_mask, [_id])
+
+        # name2idMask : dict< one-word name : ( mask<256,256>, list<id> ) >
+
+
+
+        for k, v in sorted(name2idMask.items()):
+            _mask, _ids = v
+            # if not this_Instance_Should_be_Saved(_ids): continue
+            annoDict = {}
+
+            annoDict['bbox'] = self.mask2BBox(_mask)
+            annoDict['bbox_mode'] = BoxMode.XYXY_ABS
+
+            annoDict['category_id'] = 1
+            # annoDict['category_id'] = mapGeomIDtoCategoryID(_ids[0])
+
+            _vis = np.asarray(_mask * 255 / _mask.max()).astype(np.uint8)
+            # _vis = Image.fromarray(_vis, mode='L').convert('1')
+            _vis = Image.fromarray(_vis, mode='L').filter(ImageFilter.MinFilter(3)).filter(ImageFilter.MaxFilter(3)).convert('1')
+
+            _mask = np.asarray(_vis)
+
+            _RLE = Mask2RLE( np.asarray( _mask, dtype=np.uint8, order='F') )
+            annoDict['segmentation'] = _RLE
+
+
+            returnDict['annotations'].append(annoDict)
+            _vis.save(os.path.join('.', 'filter_3', f'{img_id}_{k}.png'))
+
+
+        return returnDict
+
+    def mask2BBox(self, mask):
+        # np.any over axis 0 collapses rows, giving the occupied columns (x range);
+        # np.any over axis 1 collapses columns, giving the occupied rows (y range),
+        # so the returned box is (xmin, ymin, xmax, ymax), matching BoxMode.XYXY_ABS.
+        cols_any = np.any(mask, axis=0)
+        rows_any = np.any(mask, axis=1)
+        xmin, xmax = np.where(cols_any)[0][[0, -1]]
+        ymin, ymax = np.where(rows_any)[0][[0, -1]]
+        return [xmin, ymin, xmax, ymax]
+
+
+class VisualManager():
+    def __init__(
+        self,
+        MODEL_ROOT = None,
+        DATA_ROOT = None,
+        verbose = True,
+        train_schedule = (10_000,),
+        _preprocessor = Preprocessor,
+        preprocessor_kwarg = dict(),
+        _imagesaver = ImageSaver,
+        imagesaver_kwarg = dict(),
+        _trainer = Trainer,
+        trainer_kwarg = dict(),
+    ):
+        preprocessor_kwarg["MODEL_ROOT"] = MODEL_ROOT
+        self.preprocessor = _preprocessor(**preprocessor_kwarg)
+
+        imagesaver_kwarg["DATA_ROOT"] = DATA_ROOT
+        self.imagesaver = _imagesaver(**imagesaver_kwarg)
+
+        self.trainer = _trainer(
+            MODEL_ROOT = MODEL_ROOT,
+            DATA_ROOT = DATA_ROOT,
+            **trainer_kwarg
+        )
+
+        self.verbose = verbose
+        self.image_saved = 0
+        self.train_schedule = train_schedule
+
+        if not self.imagesaver.save_mode and self.trainer.train_mode:
+            print("[VisualManager]Warning: train_mode is on but save_mode is off, so training will never be triggered when VisualManager is called.")
+            _sanity = input('[VisualManager]You can still call VisualManager.train() to force training. Are you sure? [y/n (default: raise an error)]')
+            assert _sanity == 'Y' or _sanity == 'y', '[VisualManager]train_mode is on, while save_mode is not'
+        if self.verbose: print("[VisualManager]Finished Init")
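+
+    # Illustrative summary of the __call__ contract (mirrors the README above): the input
+    # vis is obs['<camera>_image'] with shape (2, H, W, 3) (rgb + segmentation); the output
+    # is a torch tensor of shape (N, 64) with one embedding per detected object when the
+    # default getVec head is used, or [[0]] when the predictor finds nothing in the frame.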
+
+
+    def __call__(self, vis, env):
+        if self.verbose and self.imagesaver.save_mode:
+            print(f"[VisualManager]Datasaver : {self.imagesaver.counter}/{self.imagesaver.save_freq}")
+            print(f"[VisualManager]image saved: {self.image_saved}")
+
+        img, seg = np.array(vis).astype(np.uint8)
+
+        img = np.rot90(img, k=2)
+        seg = np.rot90(seg, k=2)
+
+        if self.imagesaver(img, seg, env):
+
+            self.image_saved += 1
+            if self.verbose:
+                print("[VisualManager]One image saved")
+                print("[VisualManager]train schedule:", self.train_schedule)
+
+            if self.image_saved in self.train_schedule and self.trainer.train_mode:
+                if self.verbose: print("[VisualManager]Trainer starts training")
+                self.trainer.train(sche = f"tune_model_{self.image_saved}")
+                self.preprocessor.load_model(self.trainer.get_current_root())
+
+        # return embedded vectors
+        if self.verbose: print("[VisualManager]returning feature vectors...")
+
+        return self.preprocessor(img)
+
+
+    def train(self, train_name = "force-train"):
+        print("[VisualManager]unscheduled train")
+
+        if self.verbose: print("[VisualManager]Trainer starts training")
+
+        self.trainer.train(sche = f"tune_model_{train_name}")
+        self.preprocessor.load_model(self.trainer.get_current_root())
diff --git a/robosuite/utils/visual/__init__.py b/robosuite/utils/visual/__init__.py
new file mode 100644
index 0000000000..e69de29bb2