@@ -26,16 +26,20 @@ def __init__(
         self.pil_image = None
         self.has_latent = False
         w, h = image_shape
-        try:
-            comp_adjusted = TF.resize(comp.clone(), (h, w))
-        except:
-            # comp_adjusted = comp.clone()
-            # Need to convert the latent to its image form
-            comp_adjusted = img_model.decode_tensor(comp.clone())
+        comp_adjusted = TF.resize(comp.clone(), (h, w))
+        # try:
+        #     comp_adjusted = TF.resize(comp.clone(), (h, w))
+        # except:
+        #     # comp_adjusted = comp.clone()
+        #     # Need to convert the latent to its image form
+        #     comp_adjusted = img_model.decode_tensor(comp.clone())
         self.direct_loss = MSELoss(comp_adjusted, weight, stop, name, image_shape)

     @torch.no_grad()
     def set_comp(self, pil_image, device=DEVICE):
+        """
+        sets the DIRECT loss anchor "comp" to the tensorized image.
+        """
         logger.debug(type(pil_image))
         self.pil_image = pil_image
         self.has_latent = False
@@ -47,6 +51,10 @@ def set_comp(self, pil_image, device=DEVICE):

     @classmethod
     def convert_input(cls, input, img):
+        """
+        Converts the input image tensor to the image representation of the image model.
+        E.g. if img is VQGAN, then the input tensor is converted to the latent representation.
+        """
         logger.debug(type(input))  # pretty sure this is gonna be tensor
         # return input  # this is the default MSE loss version
         return img.make_latent(input)
@@ -107,25 +115,62 @@ def get_loss(self, input, img):
         logger.debug(
             self.comp.shape
         )  # [1 1 1 1] -> from target image constructor when no input image provided
+
+        # why is the latent comp only set here? why not in the __init__ and set_comp?
         if not self.has_latent:
             # make_latent() encodes the image through a dummy class instance, returns the resulting fitted image representation
             # if get_image_tensor() is not implemented, then the returned 'latent' tensor is just the tensorized pil image
             latent = img.make_latent(self.pil_image)
             logger.debug(type(latent))  # EMAParametersDict
             logger.debug(type(self.comp))  # torch.Tensor
             with torch.no_grad():
-                self.comp.set_(latent.clone())
+                if type(latent) == type(self.comp):
+                    self.comp.set_(latent.clone())
+                # else:
+
             self.has_latent = True
+
         l1 = super().get_loss(img.get_latent_tensor(), img) / 2
         l2 = self.direct_loss.get_loss(input, img) / 10
         return l1 + l2


 ######################################################################

+# fuck it, let's just make a dip latent loss from scratch.
+
+
+# The issue we're resolving here is that by inheriting from the MSELoss,
+# I can't easily set the comp to the parameters of the image model.
+
+from pytti.LossAug.BaseLossClass import Loss
+from pytti.image_models.ema import EMAImage, EMAParametersDict
+from pytti.rotoscoper import Rotoscoper
+
+import deep_image_prior
+import deep_image_prior.models
+from deep_image_prior.models import (
+    get_hq_skip_net,
+    get_non_offset_params,
+    get_offset_params,
+)

-class LatentLossGeneric(LatentLoss):
-    # class LatentLoss(MSELoss):
+
+def load_dip(input_depth, num_scales, offset_type, offset_groups, device):
+    dip_net = get_hq_skip_net(
+        input_depth,
+        skip_n33d=192,
+        skip_n33u=192,
+        skip_n11=4,
+        num_scales=num_scales,
+        offset_type=offset_type,
+        offset_groups=offset_groups,
+    ).to(device)
+
+    return dip_net
+
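+# Illustrative sketch (not part of this commit): default_comp() below wraps the DIP
+# network returned by load_dip() in an EMAParametersDict so that the network's
+# parameters serve as the EMA-tracked "latent", roughly:
+#
+#     net = load_dip(input_depth=32, num_scales=7, offset_type="none",
+#                    offset_groups=4, device="cuda")
+#     comp = EMAParametersDict(z=net, decay=0.99, device="cuda")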
+
+class LatentLossDIP(Loss):
     @torch.no_grad()
     def __init__(
         self,
@@ -134,29 +179,109 @@ def __init__(
         stop=-math.inf,
         name="direct target loss",
         image_shape=None,
+        device=None,
     ):
-        super().__init__(comp, weight, stop, name, image_shape)
+        ##################################################################
+        super().__init__(weight, stop, name, device)
+        if image_shape is None:
+            raise
+        # height, width = comp.shape[-2:]
+        # image_shape = (width, height)
+        self.image_shape = image_shape
+        self.register_buffer("mask", torch.ones(1, 1, 1, 1, device=self.device))
+        self.use_mask = False
+        ##################################################################
         self.pil_image = None
         self.has_latent = False
-        w, h = image_shape
-        self.direct_loss = MSELoss(
-            TF.resize(comp.clone(), (h, w)), weight, stop, name, image_shape
+        logger.debug(type(comp))  # inits to image tensor
+        if comp is None:
+            comp = self.default_comp()
+        if isinstance(comp, EMAParametersDict):
+            logger.debug("initializing loss from latent")
+            self.register_module("comp", comp)
+            self.has_latent = True
+        else:
+            w, h = image_shape
+            comp_adjusted = TF.resize(comp.clone(), (h, w))
+            # try:
+            #     comp_adjusted = TF.resize(comp.clone(), (h, w))
+            # except:
+            #     # comp_adjusted = comp.clone()
+            #     # Need to convert the latent to its image form
+            #     comp_adjusted = img_model.decode_tensor(comp.clone())
+            self.direct_loss = MSELoss(comp_adjusted, weight, stop, name, image_shape)
+
+        ##################################################################
+
+        logger.debug(type(comp))
+
+    @classmethod
+    def default_comp(*args, **kargs):
+        logger.debug("default_comp")
+        device = kargs.get("device", "cuda") if torch.cuda.is_available() else "cpu"
+        net = load_dip(
+            input_depth=32,
+            num_scales=7,
+            offset_type="none",
+            offset_groups=4,
+            device=device,
         )
+        return EMAParametersDict(z=net, decay=0.99, device=device)
+
+    ###################################################################################

     @torch.no_grad()
     def set_comp(self, pil_image, device=DEVICE):
+        """
+        sets the DIRECT loss anchor "comp" to the tensorized image.
+        """
+        logger.debug(type(pil_image))
         self.pil_image = pil_image
         self.has_latent = False
-        self.direct_loss.set_comp(
-            pil_image.resize(self.image_shape, Image.LANCZOS)
+        im_resized = pil_image.resize(
+            self.image_shape, Image.LANCZOS
         )  # to do: ResizeRight
+        # self.direct_loss.set_comp(im_resized)
+
+        im_tensor = (
+            TF.to_tensor(pil_image)
+            .unsqueeze(0)
+            .to(device, memory_format=torch.channels_last)
+        )
+
+        if hasattr(self, "direct_loss"):
+            self.direct_loss.set_comp(im_tensor)
+        else:
+            self.direct_loss = MSELoss(
+                im_tensor, self.weight, self.stop, self.name, self.image_shape
+            )
+        # self.direct_loss.set_comp(im_resized)
+
+    @classmethod
+    def convert_input(cls, input, img):
+        """
+        Converts the input image tensor to the image representation of the image model.
+        E.g. if img is VQGAN, then the input tensor is converted to the latent representation.
+        """
+        logger.debug(type(input))  # pretty sure this is gonna be tensor
+        # return input  # this is the default MSE loss version
+        return img.make_latent(input)

     @classmethod
     @vram_usage_mode("Latent Image Loss")
     @torch.no_grad()
     def TargetImage(
-        cls, prompt_string, image_shape, pil_image=None, is_path=False, device=DEVICE
+        cls,
+        prompt_string,
+        image_shape,
+        pil_image=None,
+        is_path=False,
+        device=DEVICE,
+        img_model=None,
     ):
+        logger.debug(
+            type(pil_image)
+        )  # None. emitted prior to do_run:559 but after parse_scenes:122. Why even use this constructor if no pil_image?
         text, weight, stop = parse(
             prompt_string, r"(?<!^http)(?<!s):|:(?!/)", ["", "1", "-inf"]
         )
@@ -168,24 +293,69 @@ def TargetImage(
         comp = (
             MSELoss.make_comp(pil_image)
             if pil_image is not None
-            else torch.zeros(1, 1, 1, 1, device=device)
+            # else torch.zeros(1, 1, 1, 1, device=device)
+            else cls.default_comp(img_model=img_model)
         )
         out = cls(comp, weight, stop, text + " (latent)", image_shape)
         if pil_image is not None:
             out.set_comp(pil_image)
-        out.set_mask(mask)
+        if (
+            mask
+        ):  # this will break if there's no pil_image since the direct_loss won't be initialized
+            out.set_mask(mask)
         return out

     def set_mask(self, mask, inverted=False):
         self.direct_loss.set_mask(mask, inverted)
-        super().set_mask(mask, inverted)
+        # super().set_mask(mask, inverted)
+        # if device is None:
+        device = self.device
+        if isinstance(mask, str) and mask != "":
+            if mask[0] == "-":
+                mask = mask[1:]
+                inverted = True
+            if mask.strip()[-4:] == ".mp4":
+                r = Rotoscoper(mask, self)
+                r.update(0)
+                return
+            mask = Image.open(fetch(mask)).convert("L")
+        if isinstance(mask, Image.Image):
+            with vram_usage_mode("Masks"):
+                mask = (
+                    TF.to_tensor(mask)
+                    .unsqueeze(0)
+                    .to(device, memory_format=torch.channels_last)
+                )
+        if mask not in ["", None]:
+            self.mask.set_(mask if not inverted else (1 - mask))
+        self.use_mask = mask not in ["", None]

     def get_loss(self, input, img):
+        logger.debug(type(input))  # Tensor
+        logger.debug(input.shape)  # this is an image tensor
+        logger.debug(type(img))  # DIPImage
+        logger.debug(type(self.comp))  # EMAParametersDict
+        # logger.debug(
+        #     self.comp.shape
+        # )  # [1 1 1 1] -> from target image constructor when no input image provided
+
+        # why is the latent comp only set here? why not in the __init__ and set_comp?
         if not self.has_latent:
+            raise
+            # make_latent() encodes the image through a dummy class instance, returns the resulting fitted image representation
+            # if get_image_tensor() is not implemented, then the returned 'latent' tensor is just the tensorized pil image
             latent = img.make_latent(self.pil_image)
+            logger.debug(type(latent))  # EMAParametersDict
+            logger.debug(type(self.comp))  # torch.Tensor
             with torch.no_grad():
-                self.comp.set_(latent.clone())
+                if type(latent) == type(self.comp):
+                    self.comp.set_(latent.clone())
+                # else:
+
             self.has_latent = True
+
+        estimated_image = self.comp.get_image_tensor()
+
         l1 = super().get_loss(img.get_latent_tensor(), img) / 2
         l2 = self.direct_loss.get_loss(input, img) / 10
         return l1 + l2
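+
+
+# Usage sketch (illustrative only, not part of this commit; the prompt string, shape,
+# and variable names below are made up for demonstration):
+#
+#     loss = LatentLossDIP.TargetImage(
+#         "init image:1:-inf", image_shape=(512, 512), pil_image=init_pil, img_model=img
+#     )
+#     total_loss = loss.get_loss(current_image_tensor, img)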