-
Notifications
You must be signed in to change notification settings - Fork 121
Support Configurable Large Messages (> 1 MiB) via Zero-Copy Vectored I/O #198
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
ddc386c
c263028
72acb69
8005ed3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ package fuse | |
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "io" | ||
| "log" | ||
|
|
@@ -80,6 +81,8 @@ type Connection struct { | |
| // Freelists, serviced by freelists.go. | ||
| inMessages freelist.Freelist // GUARDED_BY(mu) | ||
| outMessages freelist.Freelist // GUARDED_BY(mu) | ||
|
|
||
| inMessageSize int | ||
| } | ||
|
|
||
| // State that is maintained for each in-flight op. This is stuffed into the | ||
|
|
@@ -121,6 +124,12 @@ func newConnection( | |
| cancelFuncs: make(map[uint64]func()), | ||
| } | ||
|
|
||
| maxPayload := max(buffer.MaxReadSize, buffer.MaxWriteSize) | ||
| if cfg.MaxMessageSize > 0 { | ||
| maxPayload = max(maxPayload, int(cfg.MaxMessageSize)) | ||
| } | ||
| c.inMessageSize = maxPayload + buffer.GetPageSize() | ||
|
|
||
| // Initialize. | ||
| if err := c.Init(); err != nil { | ||
| c.close() | ||
|
|
@@ -172,7 +181,9 @@ func (c *Connection) Init() error { | |
| // Respond to the init op. | ||
| initOp.Library = c.protocol | ||
| initOp.MaxReadahead = maxReadahead | ||
| initOp.MaxWrite = buffer.MaxWriteSize | ||
|
|
||
| maxPayload := c.inMessageSize - buffer.GetPageSize() | ||
| initOp.MaxWrite = uint32(maxPayload) | ||
|
|
||
| initOp.Flags = 0 | ||
|
|
||
|
|
@@ -190,7 +201,6 @@ func (c *Connection) Init() error { | |
| // payload. It applies to both requests and replies, and does not include | ||
| // the extra 1 page for the FUSE header and the "args" struct. We set it to | ||
| // the max of our message in/out payload sizes. | ||
| maxPayload := max(buffer.MaxReadSize, buffer.MaxWriteSize) | ||
| initOp.MaxPages = uint16(maxPayload / buffer.GetPageSize()) | ||
|
|
||
| // Enable writeback caching if the user hasn't asked us not to. | ||
|
|
@@ -376,6 +386,7 @@ func (c *Connection) handleInterrupt(fuseID uint64) { | |
| func (c *Connection) readMessage() (*buffer.InMessage, error) { | ||
| // Allocate a message. | ||
| m := c.getInMessage() | ||
| m.AllocBlocks(c.inMessageSize) | ||
|
|
||
| // Loop past transient errors. | ||
| for { | ||
|
|
@@ -389,15 +400,11 @@ func (c *Connection) readMessage() (*buffer.InMessage, error) { | |
| // * EINTR means we should try again. (This seems to happen often on | ||
| // OS X, cf. http://golang.org/issue/11180) | ||
| // | ||
| if pe, ok := err.(*os.PathError); ok { | ||
| switch pe.Err { | ||
| case syscall.ENODEV: | ||
| err = io.EOF | ||
|
|
||
| case syscall.EINTR: | ||
| err = nil | ||
| continue | ||
| } | ||
| if errors.Is(err, syscall.ENODEV) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are removing a typecasting here? Are these changes intentiontal? If yes, how did they work earlier? |
||
| err = io.EOF | ||
| } else if errors.Is(err, syscall.EINTR) { | ||
| err = nil | ||
| continue | ||
| } | ||
|
|
||
| if err != nil { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,13 +145,14 @@ func convertInMessage( | |
| } | ||
|
|
||
| entries := make([]fuseops.BatchForgetEntry, 0, in.Count) | ||
| for i := uint32(0); i < in.Count; i++ { | ||
| type entry fusekernel.BatchForgetEntryIn | ||
| ein := (*entry)(inMsg.Consume(unsafe.Sizeof(entry{}))) | ||
| if ein == nil { | ||
| return nil, errors.New("Corrupt OpBatchForget") | ||
| } | ||
| entrySize := unsafe.Sizeof(fusekernel.BatchForgetEntryIn{}) | ||
| buf := inMsg.ConsumeBytes(uintptr(in.Count) * entrySize) | ||
| if len(buf) < int(in.Count*uint32(entrySize)) { | ||
| return nil, errors.New("Corrupt OpBatchForget") | ||
| } | ||
|
|
||
| for i := uint32(0); i < in.Count; i++ { | ||
| ein := (*fusekernel.BatchForgetEntryIn)(unsafe.Pointer(&buf[uintptr(i)*entrySize])) | ||
| entries = append(entries, fuseops.BatchForgetEntry{ | ||
| Inode: fuseops.InodeID(ein.Inode), | ||
| N: ein.Nlookup, | ||
|
|
@@ -396,7 +397,11 @@ func convertInMessage( | |
| }, | ||
| } | ||
| // Use part of the incoming message storage as the read buffer. | ||
| to.Dst = inMsg.GetFree(int(in.Size)) | ||
| if config.EnableVectoredReads && int(in.Size) > buffer.MiBPlusPageSize { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Brainstorming a bit here. why do we need to support both vectoredReads and non vectoredReads. Can we just pass 2-D array always. It would be a minor change on the GCSFuse side. How big of a change will it be on GCSFuse side? I am guessing we can just pick the first block from the array and pass it downstream when messageSize is 1MB? |
||
| to.DstBufs = inMsg.GetFreeVector(int(in.Size)) | ||
| } else { | ||
| to.Dst = inMsg.GetFree(int(in.Size)) | ||
| } | ||
| o = to | ||
|
|
||
| case fusekernel.OpReaddir: | ||
|
|
@@ -498,16 +503,31 @@ func convertInMessage( | |
| return nil, errors.New("Corrupt OpWrite") | ||
| } | ||
|
|
||
| buf := inMsg.ConsumeBytes(inMsg.Len()) | ||
| if len(buf) < int(in.Size) { | ||
| return nil, errors.New("Corrupt OpWrite") | ||
| var buf []byte | ||
| var dataBlocks [][]byte | ||
|
|
||
| if config.EnableVectoredWrites && inMsg.Len() > uintptr(buffer.MiBPlusPageSize) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. by moving everything to vectoredReads/writes we need not do if-else every where. the code becomes much simpler. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reduces the number of configs too. |
||
| dataBlocks = inMsg.ConsumeVector(inMsg.Len()) | ||
| var totalLen int | ||
| for _, b := range dataBlocks { | ||
| totalLen += len(b) | ||
| } | ||
| if totalLen < int(in.Size) { | ||
| return nil, errors.New("Corrupt OpWrite") | ||
| } | ||
| } else { | ||
| buf = inMsg.ConsumeBytes(inMsg.Len()) | ||
| if len(buf) < int(in.Size) { | ||
| return nil, errors.New("Corrupt OpWrite") | ||
| } | ||
| } | ||
|
|
||
| o = &fuseops.WriteFileOp{ | ||
| Inode: fuseops.InodeID(inMsg.Header().Nodeid), | ||
| Handle: fuseops.HandleID(in.Fh), | ||
| Data: buf, | ||
| Offset: int64(in.Offset), | ||
| Inode: fuseops.InodeID(inMsg.Header().Nodeid), | ||
| Handle: fuseops.HandleID(in.Fh), | ||
| Data: buf, | ||
| DataBlocks: dataBlocks, | ||
| Offset: int64(in.Offset), | ||
| OpContext: fuseops.OpContext{ | ||
| FuseID: inMsg.Header().Unique, | ||
| Pid: inMsg.Header().Pid, | ||
|
|
@@ -933,14 +953,16 @@ func (c *Connection) kernelResponseForOp( | |
| case *fuseops.ReadFileOp: | ||
| if o.Data != nil { | ||
| m.Append(o.Data...) | ||
| } else if o.DstBufs != nil { | ||
| m.Append(o.DstBufs...) | ||
| } else { | ||
| m.Append(o.Dst) | ||
| } | ||
| m.ShrinkTo(buffer.OutMessageHeaderSize + o.BytesRead) | ||
|
|
||
| case *fuseops.WriteFileOp: | ||
| out := (*fusekernel.WriteOut)(m.Grow(int(unsafe.Sizeof(fusekernel.WriteOut{})))) | ||
| out.Size = uint32(len(o.Data)) | ||
| out.Size = uint32(o.TotalSize()) | ||
|
|
||
| case *fuseops.SyncFileOp: | ||
| // Empty response | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we anticipate different sizes for read and write. if not can we have just one variable which tells size of the request for both reads and writes.