Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions vm/devices/net/gdma_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ pub struct HwcRxOobFlags {
// Driver capability bits. The driver ORs a selection of these together into
// the `gd_drv_cap_flags1` field of the request it sends during version
// verification, to advertise the features it supports.
pub const DRIVER_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG: u64 = 0x08;
pub const DRIVER_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT: u64 = 0x20;
pub const DRIVER_CAP_FLAG_1_HW_VPORT_LINK_AWARE: u64 = 0x40;
pub const DRIVER_CAP_FLAG_1_SELF_RESET_ON_EQE_NOTIFICATION: u64 = 0x4000;
pub const DRIVER_CAP_FLAG_1_VTL2_REVOKE_SUB_ON_RESET_EQE: u64 = 0x10000;

#[repr(C)]
#[derive(Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
Expand All @@ -478,6 +480,9 @@ pub struct GdmaVerifyVerReq {
pub os_ver_str4: [u8; 128],
}

// Physical function (PF) capability bits. The driver tests these against the
// `pf_cap_flags1` field of the version-verification response.
pub const GDMA_PF_CAP_FLAG_1_QUERY_HWC_TIMEOUT: u64 = 0x08;
pub const GDMA_PF_CAP_FLAG_1_EQE_REQUEST_VF_SELF_RESET: u64 = 0x80;

#[repr(C)]
#[derive(Debug, IntoBytes, Immutable, KnownLayout, FromBytes)]
pub struct GdmaVerifyVerResp {
Expand Down
57 changes: 53 additions & 4 deletions vm/devices/net/mana_driver/src/gdma_driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ use futures::FutureExt;
use gdma_defs::Cqe;
use gdma_defs::DRIVER_CAP_FLAG_1_HW_VPORT_LINK_AWARE;
use gdma_defs::DRIVER_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG;
use gdma_defs::DRIVER_CAP_FLAG_1_SELF_RESET_ON_EQE_NOTIFICATION;
use gdma_defs::DRIVER_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT;
use gdma_defs::DRIVER_CAP_FLAG_1_VTL2_REVOKE_SUB_ON_RESET_EQE;
use gdma_defs::EqeDataReconfig;
use gdma_defs::EstablishHwc;
use gdma_defs::GDMA_EQE_COMPLETION;
Expand All @@ -29,6 +31,8 @@ use gdma_defs::GDMA_EQE_HWC_RECONFIG_VF;
use gdma_defs::GDMA_EQE_TEST_EVENT;
use gdma_defs::GDMA_MESSAGE_V1;
use gdma_defs::GDMA_PAGE_TYPE_4K;
use gdma_defs::GDMA_PF_CAP_FLAG_1_EQE_REQUEST_VF_SELF_RESET;
use gdma_defs::GDMA_PF_CAP_FLAG_1_QUERY_HWC_TIMEOUT;
use gdma_defs::GDMA_STANDARD_HEADER_TYPE;
use gdma_defs::GdmaChangeMsixVectorIndexForEq;
use gdma_defs::GdmaCreateDmaRegionReq;
Expand Down Expand Up @@ -1017,7 +1021,10 @@ impl<T: DeviceBacking> GdmaDriver<T> {
}
GDMA_EQE_HWC_RECONFIG_VF => {
// No data is supplied for VF reconfiguration events.
// The HWC no longer responds after this event, so set the timeout to
// zero to skip waiting on responses during teardown.
tracing::info!("HWC VF reconfiguration event");
self.hwc_timeout_in_ms = 0;
self.vf_reconfiguration_pending = true;
}
ty => tracing::error!(ty, "unknown eq event"),
Expand Down Expand Up @@ -1074,12 +1081,21 @@ impl<T: DeviceBacking> GdmaDriver<T> {
}
}

// Early exit with no eqe found if timeout occurs (or hwc_timeout_in_ms is 0).
if eqe_wait_result.elapsed >= self.hwc_timeout_in_ms as u128 {
eqe_wait_result.eqe_found = false;
break eqe_wait_result;
}

// Wait for an interrupt.
eqe_wait_result.interrupt_wait_count += 1;
let ms_wait = (HWC_INTERRUPT_POLL_WAIT_MIN_MS
* 2u32.pow(eqe_wait_result.interrupt_wait_count - 1))
.min(HWC_INTERRUPT_POLL_WAIT_MAX_MS)
.min(self.hwc_timeout_in_ms - eqe_wait_result.elapsed as u32);
.min(
self.hwc_timeout_in_ms
.saturating_sub(eqe_wait_result.elapsed as u32),
);
let before_wait = std::time::Instant::now();
eqe_wait_result.last_wait_result = Self::wait_for_hwc_interrupt(
self.interrupts[0].as_mut().unwrap(),
Expand Down Expand Up @@ -1132,8 +1148,16 @@ impl<T: DeviceBacking> GdmaDriver<T> {
_ => "response received with delay",
}
);
self.report_hwc_timeout(wait_failed, interrupt_loss, eqe_wait_result.elapsed as u32)
// When hwc_timeout_in_ms is 0 (e.g., after EQE 135), skip reporting to avoid a spin-wait
// on an unresponsive SoC.
if self.hwc_timeout_in_ms > 0 {
self.report_hwc_timeout(
wait_failed,
interrupt_loss,
eqe_wait_result.elapsed as u32,
)
.await;
}
if !wait_failed && eqe_wait_result.elapsed > self.hwc_warning_time_in_ms as u128 {
// Increase warning threshold after each delay warning occurrence.
self.hwc_warning_time_in_ms += HWC_WARNING_INCREASE_IN_MS;
Expand Down Expand Up @@ -1232,7 +1256,9 @@ impl<T: DeviceBacking> GdmaDriver<T> {
protocol_ver_max: 1,
gd_drv_cap_flags1: DRIVER_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT
| DRIVER_CAP_FLAG_1_HW_VPORT_LINK_AWARE
| DRIVER_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG,
| DRIVER_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG
| DRIVER_CAP_FLAG_1_SELF_RESET_ON_EQE_NOTIFICATION
| DRIVER_CAP_FLAG_1_VTL2_REVOKE_SUB_ON_RESET_EQE,
..FromZeros::new_zeroed()
},
)
Expand All @@ -1241,6 +1267,18 @@ impl<T: DeviceBacking> GdmaDriver<T> {
if resp.gdma_protocol_ver != 1 {
anyhow::bail!("invalid protocol version");
}

// Logging the SoC / PF capabilities
let query_hwc_timeout = resp.pf_cap_flags1 & GDMA_PF_CAP_FLAG_1_QUERY_HWC_TIMEOUT != 0;
let eqe_request_vf_self_reset =
resp.pf_cap_flags1 & GDMA_PF_CAP_FLAG_1_EQE_REQUEST_VF_SELF_RESET != 0;
tracing::info!(
pf_cap_flags1 = resp.pf_cap_flags1,
query_hwc_timeout,
eqe_request_vf_self_reset,
"physical function capability flags",
);

Ok(())
}

Expand All @@ -1267,7 +1305,7 @@ impl<T: DeviceBacking> GdmaDriver<T> {
}

/// Deregisters the device identified by `dev_id` by issuing a
/// `GDMA_DEREGISTER_DEVICE` request.
///
/// Before the request is sent, the HWC timeout is capped at
/// `HWC_TIMEOUT_FOR_SHUTDOWN_IN_MS`. The timeout is only ever lowered here,
/// never raised, so a value that was already reduced (for example, zeroed by
/// a `GDMA_EQE_HWC_RECONFIG_VF` event) is preserved.
///
/// # Errors
///
/// Propagates any error returned by the underlying request.
pub async fn deregister_device(&mut self, dev_id: GdmaDevId) -> anyhow::Result<()> {
    // Clamp with `min` instead of assigning unconditionally: a plain
    // assignment would raise a zeroed timeout back up and make teardown wait
    // on an HWC that is no longer responding.
    self.hwc_timeout_in_ms = self.hwc_timeout_in_ms.min(HWC_TIMEOUT_FOR_SHUTDOWN_IN_MS);
    self.request(GdmaRequestType::GDMA_DEREGISTER_DEVICE.0, dev_id, ())
        .await
}
Expand Down Expand Up @@ -1443,3 +1481,14 @@ impl<T: DeviceBacking> GdmaDriver<T> {
.await
}
}

/// Test-only accessors that expose private driver state to the crate's tests.
#[cfg(test)]
impl<T: DeviceBacking> GdmaDriver<T> {
    /// Returns the current value of the driver's `hwc_failure` flag.
    pub(crate) fn hwc_failure(&self) -> bool {
        self.hwc_failure
    }

    /// Returns the HWC timeout currently in effect, in milliseconds.
    pub(crate) fn hwc_timeout_in_ms(&self) -> u32 {
        self.hwc_timeout_in_ms
    }
}
46 changes: 45 additions & 1 deletion vm/devices/net/mana_driver/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,56 @@ async fn test_gdma_reconfig_vf(driver: DefaultDriver) {
!gdma.get_vf_reconfiguration_pending(),
"vf_reconfiguration_pending should be false"
);
assert_ne!(
gdma.hwc_timeout_in_ms(),
0,
"timeout should be non-zero before EQE 135"
);
assert!(
!gdma.hwc_failure(),
"hwc_failure should be false before EQE 135"
);

// Get the device ID while HWC is still alive (needed for deregister later).
let dev_id = gdma
.list_devices()
.await
.unwrap()
.iter()
.copied()
.find(|dev_id| dev_id.ty == GdmaDevType::GDMA_DEVICE_MANA)
.unwrap();

// Trigger the reconfig event (EQE 135).
gdma.generate_reconfig_vf_event().await.unwrap();
gdma.process_all_eqs();

assert!(
gdma.get_vf_reconfiguration_pending(),
"vf_reconfiguration_pending should be true after reconfig event"
);
assert_eq!(
gdma.hwc_timeout_in_ms(),
0,
"timeout should be zero after EQE 135"
);

// Deregister should preserve the zero timeout (via min) and set hwc_failure.
let deregister_result = gdma.deregister_device(dev_id).await;
let err =
deregister_result.expect_err("deregister_device should time out or fail after EQE 135");
let err_msg = format!("{err:#}");
assert!(
err_msg.contains("MANA request timed out"),
"unexpected error: {err_msg}"
);
assert_eq!(
gdma.hwc_timeout_in_ms(),
0,
"deregister_device should not raise timeout above zero after EQE 135"
);
assert!(
gdma.hwc_failure(),
"hwc_failure should be true after deregister_device times out post-EQE 135"
);
}
Loading