/* Open up the device */
ret = rndis_filter_open(nvdev); if (ret != 0) {
netdev_err(net, "unable to open device (ret %d).\n", ret); return ret;
}
rdev = nvdev->extension; if (!rdev->link_state) {
netif_carrier_on(net);
netvsc_tx_enable(nvdev, net);
}
if (vf_netdev) { /* Setting synthetic device up transparently sets * slave as up. If open fails, then slave will be * still be offline (and not used).
*/
ret = dev_open(vf_netdev, NULL); if (ret)
netdev_warn(net, "unable to open slave: %s: %d\n",
vf_netdev->name, ret);
} return 0;
}
/* Ensure pending bytes in ring are read */ for (;;) {
u32 aread = 0;
for (i = 0; i < nvdev->num_chn; i++) { struct vmbus_channel *chn
= nvdev->chan_table[i].channel;
if (!chn) continue;
/* make sure receive not running now */
napi_synchronize(&nvdev->chan_table[i].napi);
aread = hv_get_bytes_to_read(&chn->inbound); if (aread) break;
aread = hv_get_bytes_to_read(&chn->outbound); if (aread) break;
}
if (aread == 0) return 0;
if (++retry > RETRY_MAX) return -ETIMEDOUT;
usleep_range(RETRY_US_LO, RETRY_US_HI);
}
}
staticvoid netvsc_tx_disable(struct netvsc_device *nvscdev, struct net_device *ndev)
{ if (nvscdev) {
nvscdev->tx_disable = true;
virt_wmb(); /* ensure txq will not wake up after stop */
}
/* If queue index changed record the new value */ if (q_idx != old_idx &&
sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, q_idx);
return q_idx;
}
/* * Select queue for transmit. * * If a valid queue has already been assigned, then use that. * Otherwise compute tx queue based on hash and the send table. * * This is basically similar to default (netdev_pick_tx) with the added step * of using the host send_table when no other queue has been assigned. * * TODO support XPS - but get_xps_queue not exported
*/ static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{ int q_idx = sk_tx_queue_get(skb->sk);
if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) { /* If forwarding a packet, we use the recorded queue when * available for better cache locality.
*/ if (skb_rx_queue_recorded(skb))
q_idx = skb_get_rx_queue(skb); else
q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
}
/* Record the queue selected by VF so that it can be * used for common case where VF has more queues than * the synthetic device.
*/
qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
} else {
txq = netvsc_pick_tx(ndev, skb);
}
rcu_read_unlock();
while (txq >= ndev->real_num_tx_queues)
txq -= ndev->real_num_tx_queues;
/* If VF is present and up then redirect packets to it.
 * Skip the VF if it is marked down or has no carrier.
 * If netpoll is in use, then the VF can not be used either.
*/
vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev); if (vf_netdev && netif_running(vf_netdev) &&
netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
net_device_ctx->data_path_is_vf) return netvsc_vf_xmit(net, vf_netdev, skb);
/* We will at most need two pages to describe the rndis
 * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
 * of pages in a single packet. If skb is scattered around
 * more pages we try linearizing it.
*/
num_data_pgs = netvsc_get_slots(skb) + 2;
if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
++net_device_ctx->eth_stats.tx_scattered;
/* * Place the rndis header in the skb head room and * the skb->cb will be used for hv_netvsc_packet * structure.
*/
ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE); if (ret) goto no_memory;
/* Use the skb control buffer for building up the packet */
BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
sizeof_field(struct sk_buff, cb));
packet = (struct hv_netvsc_packet *)skb->cb;
/* When using AF_PACKET we need to drop VLAN header from * the frame and update the SKB to allow the HOST OS * to transmit the 802.1Q packet
*/ if (skb->protocol == htons(ETH_P_8021Q)) {
u16 vlan_tci;
skb_reset_mac_header(skb); if (eth_type_vlan(eth_hdr(skb)->h_proto)) { if (unlikely(__skb_vlan_pop(skb, &vlan_tci) != 0)) {
++net_device_ctx->eth_stats.vlan_error; goto drop;
}
if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
csum_info->transmit.tcp_checksum = 1; else
csum_info->transmit.udp_checksum = 1;
}
} else { /* Can't do offload of this type of checksum */ if (skb_checksum_help(skb)) goto drop;
}
}
/* Start filling in the page buffers with the rndis hdr */
rndis_msg->msg_len += rndis_msg_size;
packet->total_data_buflen = rndis_msg->msg_len;
packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
skb, packet, pb);
/* timestamp packet in software */
skb_tx_timestamp(skb);
ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx); if (likely(ret == 0)) return NETDEV_TX_OK;
if (ret == -EAGAIN) {
++net_device_ctx->eth_stats.tx_busy; return NETDEV_TX_BUSY;
}
if (ret == -ENOSPC)
++net_device_ctx->eth_stats.tx_no_space;
/* Ensure the packet is big enough to access its fields */ if (resp->msg_len - RNDIS_HEADER_SIZE < sizeof(struct rndis_indicate_status)) {
netdev_err(net, "invalid rndis_indicate_status packet, len: %u\n",
resp->msg_len); return;
}
/* Copy the RNDIS indicate status into nvchan->recv_buf */
memcpy(indicate, data + RNDIS_HEADER_SIZE, sizeof(*indicate));
/* Update the physical link speed when changing to another vSwitch */ if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
u32 speed;
/* Validate status_buf_offset and status_buflen. * * Certain (pre-Fe) implementations of Hyper-V's vSwitch didn't account * for the status buffer field in resp->msg_len; perform the validation * using data_buflen (>= resp->msg_len).
*/ if (indicate->status_buflen < sizeof(speed) ||
indicate->status_buf_offset < sizeof(*indicate) ||
data_buflen - RNDIS_HEADER_SIZE < indicate->status_buf_offset ||
data_buflen - RNDIS_HEADER_SIZE - indicate->status_buf_offset
< indicate->status_buflen) {
netdev_err(net, "invalid rndis_indicate_status packet\n"); return;
}
/* Copy to skb. This copy is needed here since the memory * pointed by hv_netvsc_packet cannot be deallocated.
*/ for (i = 0; i < nvchan->rsc.cnt; i++)
skb_put_data(skb, nvchan->rsc.data[i],
nvchan->rsc.len[i]);
}
skb->protocol = eth_type_trans(skb, net);
/* skb is already created with CHECKSUM_NONE */
skb_checksum_none_assert(skb);
/* Incoming packets may have IP header checksum verified by the host. * They may not have IP header checksum computed after coalescing. * We compute it here if the flags are set, because on Linux, the IP * checksum is always checked.
*/ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && csum_info->receive.ip_checksum_value_invalid &&
csum_info->receive.ip_checksum_succeeded &&
skb->protocol == htons(ETH_P_IP)) { /* Check that there is enough space to hold the IP header. */ if (skb_headlen(skb) < sizeof(struct iphdr)) {
kfree_skb(skb); return NULL;
}
netvsc_comp_ipcsum(skb);
}
/* Do L4 checksum offload if enabled and present. */ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && (net->features & NETIF_F_RXCSUM)) { if (csum_info->receive.tcp_checksum_succeeded ||
csum_info->receive.udp_checksum_succeeded)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
/* Allocate a skb - TODO direct I/O to pages? */
skb = netvsc_alloc_recv_skb(net, nvchan, &xdp);
if (unlikely(!skb)) {
++net_device_ctx->eth_stats.rx_no_memory; return NVSP_STAT_FAIL;
}
skb_record_rx_queue(skb, q_idx);
/* * Even if injecting the packet, record the statistics * on the synthetic device because modifying the VF device * statistics will not work correctly.
*/
u64_stats_update_begin(&rx_stats->syncp); if (act == XDP_TX)
rx_stats->xdp_tx++;
/* We do not support separate count for rx, tx, or other */ if (count == 0 ||
channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL;
if (!nvdev || nvdev->destroy) return -ENODEV;
if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5) return -EINVAL;
if (count > nvdev->max_chn) return -EINVAL;
orig = nvdev->num_chn;
device_info = netvsc_devinfo_get(nvdev);
if (!device_info) return -ENOMEM;
device_info->num_chn = count;
ret = netvsc_detach(net, nvdev); if (ret) goto out;
ret = netvsc_attach(net, device_info); if (ret) {
device_info->num_chn = orig; if (netvsc_attach(net, device_info))
netdev_err(net, "restoring channel setting failed\n");
}
/* Hyper-V RNDIS protocol does not have ring in the HW sense. * It does have pre-allocated receive area which is divided into sections.
*/ staticvoid __netvsc_get_ringparam(struct netvsc_device *nvdev, struct ethtool_ringparam *ring)
{
u32 max_buf_size;
switch (event->event) { /* Only the following events are possible due to the check in * netvsc_linkstatus_callback()
*/ case RNDIS_STATUS_MEDIA_CONNECT: if (rdev->link_state) {
rdev->link_state = false;
netif_carrier_on(net);
netvsc_tx_enable(net_device, net);
} else {
__netdev_notify_peers(net);
}
kfree(event); break; case RNDIS_STATUS_MEDIA_DISCONNECT: if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netvsc_tx_disable(net_device, net);
}
kfree(event); break; case RNDIS_STATUS_NETWORK_CHANGE: /* Only makes sense if carrier is present */ if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netvsc_tx_disable(net_device, net);
event->event = RNDIS_STATUS_MEDIA_CONNECT;
spin_lock_irqsave(&ndev_ctx->lock, flags);
list_add(&event->list, &ndev_ctx->reconfig_events);
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
reschedule = true;
} break;
}
rtnl_unlock();
/* link_watch only sends one notification with current state per * second, handle next reconfig event in 2 seconds.
*/ if (reschedule)
schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
staticint netvsc_vf_join(struct net_device *vf_netdev, struct net_device *ndev, int context)
{ struct net_device_context *ndev_ctx = netdev_priv(ndev); int ret;
ret = netdev_rx_handler_register(vf_netdev,
netvsc_vf_handle_frame, ndev); if (ret != 0) {
netdev_err(vf_netdev, "can not register netvsc VF receive handler (err = %d)\n",
ret); goto rx_handler_failed;
}
ret = netdev_master_upper_dev_link(vf_netdev, ndev,
NULL, NULL, NULL); if (ret != 0) {
netdev_err(vf_netdev, "can not set master device %s (err = %d)\n",
ndev->name, ret); goto upper_link_failed;
}
/* If this registration is called from probe context vf_takeover * is taken care of later in probe itself.
*/ if (context == VF_REG_IN_NOTIFIER)
schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
netdev_info(vf_netdev, "joined to %s\n", ndev->name); return 0;
/*
 * Bring a VF device in line with the synthetic device's configuration.
 *
 * @ndev:      synthetic (master) device whose settings are copied
 * @vf_netdev: VF device being configured
 *
 * Copies the MTU, the interface flags (with IFF_SLAVE added) and the
 * unicast/multicast address lists from @ndev to @vf_netdev, then opens the
 * VF if @ndev is currently running.  Failures of dev_set_mtu() and
 * dev_open() are only logged as warnings; the remaining steps still run.
 * The return value of dev_change_flags() is not checked.
 */
static void __netvsc_vf_setup(struct net_device *ndev,
			      struct net_device *vf_netdev)
{
	int ret;

	/* Align MTU of VF with master */
	ret = dev_set_mtu(vf_netdev, ndev->mtu);
	if (ret)
		netdev_warn(vf_netdev, "unable to change mtu to %u\n",
			    ndev->mtu);

	/* set multicast etc flags on VF */
	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL);

	/* sync address list from ndev to VF (under ndev's address lock) */
	netif_addr_lock_bh(ndev);
	dev_uc_sync(vf_netdev, ndev);
	dev_mc_sync(vf_netdev, ndev);
	netif_addr_unlock_bh(ndev);

	/* Only bring the VF up when the synthetic device itself is up */
	if (netif_running(ndev)) {
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(vf_netdev, "unable to open: %d\n", ret);
	}
}
/* Setup VF as slave of the synthetic device.
 * Runs in workqueue to avoid recursion in netlink callbacks.
 *
 * @w: work item embedded in struct net_device_context as vf_takeover.work
 *
 * If the RTNL lock cannot be taken right away, the work is re-queued with
 * zero delay instead of blocking.  When no VF is currently recorded in the
 * context, nothing is configured.
 */
static void netvsc_vf_setup(struct work_struct *w)
{
	struct net_device_context *ndev_ctx =
		container_of(w, struct net_device_context, vf_takeover.work);
	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
	struct net_device *vf_netdev;

	if (!rtnl_trylock()) {
		/* Lock is busy: try again later rather than wait here */
		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
		return;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		__netvsc_vf_setup(ndev, vf_netdev);

	rtnl_unlock();
}
/* Find netvsc by VF serial number.
 * The PCI hyperv controller records the serial number as the slot kobj name.
 *
 * @vf_netdev: VF network device to find the matching synthetic device for
 *
 * Returns the matching net_device from netvsc_dev_list, or NULL when the
 * VF's parent is missing or not a PCI device, when the PCI device has no
 * slot, or when no list entry matches.
 *
 * NOTE(review): "staticstruct" and "conststruct" below look like keyword
 * pairs that lost their separating space - confirm against the real file.
 * NOTE(review): in the code visible here "serial" is never assigned before
 * it is printed in the final notice, and the only visible lookup compares
 * MAC addresses; the serial-number parsing and matching that this header
 * describes appears to be missing from this view - TODO confirm.
 */
staticstruct net_device *get_netvsc_byslot(conststruct net_device *vf_netdev)
{ struct device *parent = vf_netdev->dev.parent; struct net_device_context *ndev_ctx; struct net_device *ndev; struct pci_dev *pdev;
u32 serial;
if (!parent || !dev_is_pci(parent)) return NULL; /* not a PCI device */
pdev = to_pci_dev(parent); if (!pdev->slot) {
netdev_notice(vf_netdev, "no PCI slot information\n"); return NULL;
}
/* Fallback path to check synthetic vf with help of mac addr.
 * Because this function can be called before vf_netdev is
 * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied
 * from dev_addr, also try to match to its dev_addr.
 * Note: On Hyper-V and Azure, it's not possible to set a MAC address
 * on a VF that matches the MAC of an unrelated NETVSC device.
*/
list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
ndev = hv_get_drvdata(ndev_ctx->device_ctx); if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) ||
ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) return ndev;
}
/* No entry matched by either address: report and give up */
netdev_notice(vf_netdev, "no netdev found for vf serial:%u\n", serial); return NULL;
}
ndev = get_netvsc_byslot(vf_netdev); if (!ndev) return NOTIFY_DONE;
/* Set slave flag and no addrconf flag before open * to prevent IPv6 addrconf.
*/
vf_netdev->flags |= IFF_SLAVE;
vf_netdev->priv_flags |= IFF_NO_ADDRCONF; return NOTIFY_DONE;
}
/* if synthetic interface is a different namespace, * then move the VF to that namespace; join will be * done again in that context.
*/ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
ret = dev_change_net_namespace(vf_netdev,
dev_net(ndev), "eth%d"); if (ret)
netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n",
ndev->name, ret); else
netdev_info(vf_netdev, "VF moved to namespace with: %s\n",
ndev->name); return NOTIFY_DONE;
}
/* Change the data path when VF UP/DOWN/CHANGE are detected.
 *
 * Typically a UP or DOWN event is followed by a CHANGE event, so
 * net_device_ctx->data_path_is_vf is used to cache the current data path
 * to avoid the duplicate call of netvsc_switch_datapath() and the duplicate
 * message.
 *
 * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network
 * interface, there is only the CHANGE event and no UP or DOWN event.
 *
 * @vf_netdev: VF device the event refers to
 * @event:     netdevice notifier event code
 *
 * Returns NOTIFY_DONE when no synthetic device or netvsc device is found,
 * and NOTIFY_OK when the cached data path already matches the VF state.
 *
 * NOTE(review): "staticint" and "unsignedlong" below look like keyword
 * pairs that lost their separating space - confirm against the real file.
 * NOTE(review): in the code visible here "ret" is unused, no call to
 * netvsc_switch_datapath() appears, and the final path falls off the end
 * of a non-void function without a return; part of the body appears to be
 * missing from this view - TODO confirm.
 */
staticint netvsc_vf_changed(struct net_device *vf_netdev, unsignedlong event)
{ struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; struct net_device *ndev; bool vf_is_up = false; int ret;
/* On GOING_DOWN the VF is treated as down regardless of its running state */
if (event != NETDEV_GOING_DOWN)
vf_is_up = netif_running(vf_netdev);
ndev = get_netvsc_byref(vf_netdev); if (!ndev) return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); if (!netvsc_dev) return NOTIFY_DONE;
/* Nothing to do when the cached data path already matches the VF state */
if (net_device_ctx->data_path_is_vf == vf_is_up) return NOTIFY_OK;
/* VF is up but vf_alloc is not set yet: block until vf_add completes */
if (vf_is_up && !net_device_ctx->vf_alloc) {
netdev_info(ndev, "Waiting for the VF association from host\n");
wait_for_completion(&net_device_ctx->vf_add);
}
/* In Azure, when accelerated networking is enabled, other NICs
 * like MANA, MLX, are configured as a bonded nic with
 * Netvsc(failover) NIC. For bonded NICs, the min of the max
 * pkt aggregate size of the members is propagated in the stack.
 * In order to allow these NICs (MANA/MLX) to use up to
 * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to
 * also support this in the guest.
 * This value is only increased for netvsc NIC when datapath is
 * switched over to the VF
*/
if (vf_is_up)
netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); else
netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size);
}
/* We always need headroom for rndis header */
net->needed_headroom = RNDIS_AND_PPI_SIZE;
/* Initialize the number of queues to be 1, we may change it if more * channels are offered later.
*/
netif_set_real_num_tx_queues(net, 1);
netif_set_real_num_rx_queues(net, 1);
/* Notify the netvsc driver of the new device */
device_info = netvsc_devinfo_get(NULL);
if (!device_info) {
ret = -ENOMEM; goto devinfo_failed;
}
/* We must get rtnl lock before scheduling nvdev->subchan_work, * otherwise netvsc_subchan_work() can get rtnl lock first and wait * all subchannels to show up, but that may not happen because * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer() * -> ... -> device_add() -> ... -> __device_attach() can't get * the device lock, so all the subchannels can't be processed -- * finally netvsc_subchan_work() hangs forever. * * The rtnl lock also needs to be held before rndis_filter_device_add() * which advertises nvsp_2_vsc_capability / sriov bit, and triggers * VF NIC offering and registering. If VF NIC finished register_netdev() * earlier it may cause name based config failure.
*/
rtnl_lock();
nvdev = rndis_filter_device_add(dev, device_info); if (IS_ERR(nvdev)) {
ret = PTR_ERR(nvdev);
netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); goto rndis_failed;
}
eth_hw_addr_set(net, device_info->mac_adr);
if (nvdev->num_chn > 1)
schedule_work(&nvdev->subchan_work);
/* When the hv_netvsc driver is unloaded and reloaded, the
 * NET_DEVICE_REGISTER for the vf device is replayed before probe
 * is complete. This is because register_netdevice_notifier() gets
 * registered before vmbus_driver_register() so that callback func
 * is set before probe and we don't miss events like NETDEV_POST_INIT.
 * So, in this section we try to register the matching vf device that
 * is present as a netdevice, knowing that its register call is not
 * processed in the netvsc_netdev_notifier (as probing is in progress and
 * get_netvsc_byslot fails).
*/
for_each_netdev(dev_net(net), vf_netdev) {
ret = check_dev_is_matching_vf(vf_netdev); if (ret != 0) continue;
if (net != get_netvsc_byslot(vf_netdev)) continue;
/* * Call to the vsc driver to let it know that the device is being * removed. Also blocks mtu and channel changes.
*/
vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); if (vf_netdev)
netvsc_unregister_vf(vf_netdev);
if (nvdev)
rndis_filter_device_remove(dev, nvdev);
nvdev = rtnl_dereference(ndev_ctx->nvdev); if (nvdev == NULL) {
ret = -ENODEV; goto out;
}
/* Save the current config info */
ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev); if (!ndev_ctx->saved_netvsc_dev_info) {
ret = -ENOMEM; goto out;
}
ret = netvsc_detach(net, nvdev);
out:
rtnl_unlock();
/* Reset the data path to the netvsc NIC before re-opening the vmbus * channel. Later netvsc_netdev_event() will switch the data path to * the VF upon the UP or CHANGE event.
*/
net_device_ctx->data_path_is_vf = false;
device_info = net_device_ctx->saved_netvsc_dev_info;
/* The one and only one */ staticstruct hv_driver netvsc_drv = {
.name = KBUILD_MODNAME,
.id_table = id_table,
.probe = netvsc_probe,
.remove = netvsc_remove,
.suspend = netvsc_suspend,
.resume = netvsc_resume,
.driver = {
.probe_type = PROBE_FORCE_SYNCHRONOUS,
},
};
/* Set VF's namespace same as the synthetic NIC.
 *
 * @ndev: synthetic device whose network namespace the VF should share
 *
 * Does nothing when no VF is recorded in the device context or when both
 * devices are already in the same namespace.  Otherwise the VF is moved
 * with the name pattern "eth%d"; the outcome is logged and not returned
 * to the caller.
 */
static void netvsc_event_set_vf_ns(struct net_device *ndev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct net_device *vf_netdev;
	int ret;

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (!vf_netdev)
		return;

	/* Already in the same namespace: nothing to move */
	if (net_eq(dev_net(ndev), dev_net(vf_netdev)))
		return;

	ret = dev_change_net_namespace(vf_netdev, dev_net(ndev), "eth%d");
	if (ret)
		netdev_err(vf_netdev,
			   "Cannot move to same namespace as %s: %d\n",
			   ndev->name, ret);
	else
		netdev_info(vf_netdev,
			    "Moved VF to namespace with: %s\n",
			    ndev->name);
}
if (!rtnl_trylock()) {
schedule_delayed_work(&ndev_ctx->vfns_work, 1); return;
}
ndev = hv_get_drvdata(ndev_ctx->device_ctx); if (!ndev) goto out;
netvsc_event_set_vf_ns(ndev);
out:
rtnl_unlock();
}
/* * On Hyper-V, every VF interface is matched with a corresponding * synthetic interface. The synthetic interface is presented first * to the guest. When the corresponding VF instance is registered, * we will take care of switching the data path.
*/ staticint netvsc_netdev_event(struct notifier_block *this, unsignedlong event, void *ptr)
{ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net_device_context *ndev_ctx; int ret = 0;
ret = check_dev_is_matching_vf(event_dev); if (ret != 0) return NOTIFY_DONE;
switch (event) { case NETDEV_POST_INIT: return netvsc_prepare_bonding(event_dev); case NETDEV_REGISTER: return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER); case NETDEV_UNREGISTER: return netvsc_unregister_vf(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: case NETDEV_GOING_DOWN: return netvsc_vf_changed(event_dev, event); default: return NOTIFY_DONE;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.