Skip to content

Commit e355651

Browse files
committed
PG needs to boot even when the node is a zombie, so that manual intervention is possible.
1 parent 93b3d7e commit e355651

File tree

1 file changed

+18
-19
lines changed

1 file changed

+18
-19
lines changed

pkg/flypg/node.go

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -120,34 +120,30 @@ func (n *Node) Init(ctx context.Context) error {
120120
return err
121121
}
122122

123-
// Attempt to re-introduce zombie node back into the cluster.
123+
// Attempt to re-introduce zombie back into the cluster.
124124
if ZombieLockExists() {
125125
fmt.Println("Zombie lock detected")
126126
zHostname, err := readZombieLock()
127127
if err != nil {
128128
return fmt.Errorf("failed to read zombie lock: %s", zHostname)
129129
}
130130

131-
if zHostname == "" {
131+
if zHostname != "" {
132+
if err := n.RepMgr.rejoinCluster(zHostname); err != nil {
133+
return fmt.Errorf("failed to rejoin cluster: %s", err)
134+
}
135+
136+
if err := removeZombieLock(); err != nil {
137+
return fmt.Errorf("failed to remove zombie lock: %s", err)
138+
}
139+
140+
// Ensure the single instance created with the --force-rewind process is cleaned up properly.
141+
utils.RunCommand("pg_ctl -D /data/postgresql/ stop")
142+
} else {
132143
// TODO - Provide link to documentation on how to address this
133144
fmt.Println("Zombie lock does not contain a valid hostname!")
134145
fmt.Println("This likely means that we were unable to build a consensus on who the real primary is.")
135-
fmt.Println("If you feel like this is a mistake, you can force a retry by deleting the zombie.lock file.")
136-
fmt.Println("Sleeping for 2 minutes.")
137-
time.Sleep(2 * time.Minute)
138-
return fmt.Errorf("unrecoverable zombie")
139146
}
140-
141-
if err := n.RepMgr.rejoinCluster(zHostname); err != nil {
142-
return fmt.Errorf("failed to rejoin cluster: %s", err)
143-
}
144-
145-
if err := removeZombieLock(); err != nil {
146-
return fmt.Errorf("failed to remove zombie lock: %s", err)
147-
}
148-
149-
// Ensure the single instance created with the --force-rewind process is cleaned up properly.
150-
utils.RunCommand("pg_ctl -D /data/postgresql/ stop")
151147
}
152148

153149
store, err := state.NewStore()
@@ -201,8 +197,11 @@ func (n *Node) Init(ctx context.Context) error {
201197
// PostInit are operations that should be executed against a running Postgres on boot.
202198
func (n *Node) PostInit(ctx context.Context) error {
203199
if ZombieLockExists() {
204-
time.Sleep(30 * time.Second)
205-
return fmt.Errorf("unable to continue with PostInit while a zombie. please restart the machine using `fly machine restart %s --app %s`", os.Getenv("FLY_ALLOC_ID"), n.AppName)
200+
fmt.Println("If you feel like this is a mistake, you can force a retry by deleting the zombie.lock file")
201+
fmt.Println("Sleeping for 2 minutes.")
202+
time.Sleep(2 * time.Minute)
203+
204+
return fmt.Errorf("unrecoverable zombie")
206205
}
207206

208207
// Ensure local PG is up before establishing connection with consul.

0 commit comments

Comments
 (0)