libpayload: xhci: Make XHCI stack usable on ARM

This patch updates the libpayload XHCI stack to run on ARM CPUs (tested
with the DWC3 controller on an Exynos5420). Firstly, it adds support for
64-byte Slot/Endpoint Context sizes. Since the existing context handling
code represented the whole device context as a C struct (whose size has
to be known at compile time), it was necessary to refactor the input and
device context structures to consist of pointers to the actual contexts
instead.

Secondly, it moves all data structures that the xHC accesses through DMA
to cache-coherent memory. By the same rationale as in the ARM patches
for EHCI, correctly handling the actual transfer buffers in all cases
with explicit cache maintenance functions is presumably impossible.
Instead, this patch also opts for a DMA bounce buffer in the XHCI stack,
to and from which transfer buffers that are not already cache-coherent
are copied.

BUG=chrome-os-partner:21969
TEST=Snow/Pit/Kirby correctly boot from XHCI ports.

Change-Id: I14e82fffb43b4d52d687b65415f2e33920e088de
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/169453
Reviewed-by: Stefan Reinauer <reinauer@google.com>
diff --git a/payloads/libpayload/drivers/usb/xhci.c b/payloads/libpayload/drivers/usb/xhci.c
index 6b6b0d7..5f906b2 100644
--- a/payloads/libpayload/drivers/usb/xhci.c
+++ b/payloads/libpayload/drivers/usb/xhci.c
@@ -62,7 +62,7 @@
 	if (align < min_align)
 		align = min_align;
 	xhci_spew("Aligning %zu to %zu\n", size, align);
-	return memalign(align, size);
+	return dma_memalign(align, size);
 }
 
 void
@@ -172,12 +172,7 @@
 		goto _free_xhci;
 	}
 
-	xhci_debug("context size: %dB\n", xhci->capreg->csz ? 64 : 32);
-	if (xhci->capreg->csz) {
-		xhci_debug("Only 32B contexts are supported\n");
-		goto _free_xhci;
-	}
-
+	xhci_debug("context size: %dB\n", CTXSIZE(xhci));
 	xhci_debug("maxslots: 0x%02lx\n", xhci->capreg->MaxSlots);
 	xhci_debug("maxports: 0x%02lx\n", xhci->capreg->MaxPorts);
 	const unsigned pagesize = xhci->opreg->pagesize << 12;
@@ -188,13 +183,15 @@
 	 * structures at first and can still chicken out easily if we run out
 	 * of memory.
 	 */
-	const size_t dcbaa_size = (xhci->capreg->MaxSlots + 1) * sizeof(u64);
-	xhci->dcbaa = xhci_align(64, dcbaa_size);
-	if (!xhci->dcbaa) {
+	xhci->max_slots_en = xhci->capreg->MaxSlots & CONFIG_LP_MASK_MaxSlotsEn;
+	xhci->dcbaa = xhci_align(64, (xhci->max_slots_en + 1) * sizeof(u64));
+	xhci->dev = malloc((xhci->max_slots_en + 1) * sizeof(*xhci->dev));
+	if (!xhci->dcbaa || !xhci->dev) {
 		xhci_debug("Out of memory\n");
 		goto _free_xhci;
 	}
-	memset((void*)xhci->dcbaa, 0x00, dcbaa_size);
+	memset(xhci->dcbaa, 0x00, (xhci->max_slots_en + 1) * sizeof(u64));
+	memset(xhci->dev, 0x00, (xhci->max_slots_en + 1) * sizeof(*xhci->dev));
 
 	/*
 	 * Let dcbaa[0] point to another array of pointers, sp_ptrs.
@@ -223,10 +220,18 @@
 		xhci->dcbaa[0] = virt_to_phys(xhci->sp_ptrs);
 	}
 
+	if (dma_initialized()) {
+		xhci->dma_buffer = dma_memalign(64 * 1024, DMA_SIZE);
+		if (!xhci->dma_buffer) {
+			xhci_debug("Not enough memory for DMA bounce buffer\n");
+			goto _free_xhci_structs;
+		}
+	}
+
 	/* Now start working on the hardware */
 
 	if (xhci_wait_ready(xhci))
-		goto _free_xhci;
+		goto _free_xhci_structs;
 
 	/* TODO: Check if BIOS claims ownership (and hand over) */
 
@@ -253,6 +258,7 @@
 	free((void *)xhci->er.ring);
 	free((void *)xhci->cr.ring);
 	free(xhci->roothub);
+	free(xhci->dev);
 	free(xhci);
 _free_controller:
 	detach_controller(controller);
@@ -325,8 +331,7 @@
 		return;
 
 	/* Enable all available slots */
-	xhci->opreg->config = xhci->capreg->MaxSlots & CONFIG_LP_MASK_MaxSlotsEn;
-	xhci->max_slots_en = xhci->capreg->MaxSlots & CONFIG_LP_MASK_MaxSlotsEn;
+	xhci->opreg->config = xhci->max_slots_en;
 
 	/* Set DCBAA */
 	xhci->opreg->dcbaap_lo = virt_to_phys(xhci->dcbaa);
@@ -404,6 +409,7 @@
 	}
 	free(xhci->sp_ptrs);
 	free(xhci->dcbaa);
+	free(xhci->dev);
 	free((void *)xhci->ev_ring_table);
 	free((void *)xhci->er.ring);
 	free((void *)xhci->cr.ring);
@@ -437,15 +443,15 @@
 		    const int clear_halt)
 {
 	xhci_t *const xhci = XHCI_INST(dev->controller);
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, dev->address);
 	const int slot_id = dev->address;
 	const int ep_id = ep ? xhci_ep_id(ep) : 1;
+	epctx_t *const epctx = xhci->dev[slot_id].ctx.ep[ep_id];
 
 	xhci_debug("Resetting ID %d EP %d (ep state: %d)\n",
-		   slot_id, ep_id, EC_GET(STATE, di->devctx.eps[ep_id]));
+		   slot_id, ep_id, EC_GET(STATE, epctx));
 
 	/* Run Reset Endpoint Command if the EP is in Halted state */
-	if (EC_GET(STATE, di->devctx.eps[ep_id]) == 2) {
+	if (EC_GET(STATE, epctx) == 2) {
 		const int cc = xhci_cmd_reset_endpoint(xhci, slot_id, ep_id);
 		if (cc != CC_SUCCESS) {
 			xhci_debug("Reset Endpoint Command failed: %d\n", cc);
@@ -464,9 +470,10 @@
 		clear_stall(ep);
 
 	/* Reset transfer ring if the endpoint is in the right state */
-	const unsigned ep_state = EC_GET(STATE, di->devctx.eps[ep_id]);
+	const unsigned ep_state = EC_GET(STATE, epctx);
 	if (ep_state == 3 || ep_state == 4) {
-		transfer_ring_t *const tr = di->transfer_rings[ep_id];
+		transfer_ring_t *const tr =
+				xhci->dev[slot_id].transfer_rings[ep_id];
 		const int cc = xhci_cmd_set_tr_dq(xhci, slot_id, ep_id,
 						  tr->ring, 1);
 		if (cc != CC_SUCCESS) {
@@ -477,7 +484,7 @@
 	}
 
 	xhci_debug("Finished resetting ID %d EP %d (ep state: %d)\n",
-		   slot_id, ep_id, EC_GET(STATE, di->devctx.eps[ep_id]));
+		   slot_id, ep_id, EC_GET(STATE, epctx));
 
 	return 0;
 }
@@ -554,11 +561,12 @@
 static int
 xhci_control(usbdev_t *const dev, const direction_t dir,
 	     const int drlen, void *const devreq,
-	     const int dalen, unsigned char *const data)
+	     const int dalen, unsigned char *const src)
 {
+	unsigned char *data = src;
 	xhci_t *const xhci = XHCI_INST(dev->controller);
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, dev->address);
-	transfer_ring_t *const tr = di->transfer_rings[1];
+	epctx_t *const epctx = xhci->dev[dev->address].ctx.ep0;
+	transfer_ring_t *const tr = xhci->dev[dev->address].transfer_rings[1];
 
 	const size_t off = (size_t)data & 0xffff;
 	if ((off + dalen) > ((TRANSFER_RING_SIZE - 3) << 16)) {
@@ -567,12 +575,22 @@
 	}
 
 	/* Reset endpoint if it's halted */
-	const unsigned ep_state = EC_GET(STATE, di->devctx.ep0);
+	const unsigned ep_state = EC_GET(STATE, epctx);
 	if (ep_state == 2 || ep_state == 4) {
 		if (xhci_reset_endpoint(dev, NULL, 0))
 			return -1;
 	}
 
+	if (dalen && !dma_coherent(src)) {
+		data = xhci->dma_buffer;
+		if (dalen > DMA_SIZE) {
+			xhci_debug("Control transfer too large: %d\n", dalen);
+			return -1;
+		}
+		if (dir == OUT)
+			memcpy(data, src, dalen);
+	}
+
 	/* Fill and enqueue setup TRB */
 	trb_t *const setup = tr->cur;
 	xhci_clear_trb(setup, tr->pcs);
@@ -589,7 +607,7 @@
 
 	/* Fill and enqueue data TRBs (if any) */
 	if (dalen) {
-		const unsigned mps = EC_GET(MPS, di->devctx.ep0);
+		const unsigned mps = EC_GET(MPS, epctx);
 		const unsigned dt_dir = (dir == OUT) ? TRB_DIR_OUT : TRB_DIR_IN;
 		xhci_enqueue_td(tr, 1, mps, dalen, data, dt_dir);
 	}
@@ -625,28 +643,31 @@
 				   "  usbsts:     0x%08"PRIx32"\n",
 				   i, n_stages, ret,
 				   tr->ring, setup, status,
-				   ep_state, EC_GET(STATE, di->devctx.ep0),
+				   ep_state, EC_GET(STATE, epctx),
 				   xhci->opreg->usbsts);
 			return ret;
 		}
 	}
 
+	if (dir == IN && data != src)
+		memcpy(src, data, dalen - residue);
 	return dalen - residue;
 }
 
 /* finalize == 1: if data is of packet aligned size, add a zero length packet */
 static int
-xhci_bulk(endpoint_t *const ep,
-	  const int size, u8 *const data,
+xhci_bulk(endpoint_t *const ep, const int size, u8 *const src,
 	  const int finalize)
 {
 	/* finalize: Hopefully the xHCI controller always does this.
 		     We have no control over the packets. */
 
+	u8 *data = src;
 	xhci_t *const xhci = XHCI_INST(ep->dev->controller);
+	const int slot_id = ep->dev->address;
 	const int ep_id = xhci_ep_id(ep);
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, ep->dev->address);
-	transfer_ring_t *const tr = di->transfer_rings[ep_id];
+	epctx_t *const epctx = xhci->dev[slot_id].ctx.ep[ep_id];
+	transfer_ring_t *const tr = xhci->dev[slot_id].transfer_rings[ep_id];
 
 	const size_t off = (size_t)data & 0xffff;
 	if ((off + size) > ((TRANSFER_RING_SIZE - 1) << 16)) {
@@ -654,15 +675,25 @@
 		return -1;
 	}
 
+	if (!dma_coherent(src)) {
+		data = xhci->dma_buffer;
+		if (size > DMA_SIZE) {
+			xhci_debug("Bulk transfer too large: %d\n", size);
+			return -1;
+		}
+		if (ep->direction == OUT)
+			memcpy(data, src, size);
+	}
+
 	/* Reset endpoint if it's halted */
-	const unsigned ep_state = EC_GET(STATE, di->devctx.eps[ep_id]);
+	const unsigned ep_state = EC_GET(STATE, epctx);
 	if (ep_state == 2 || ep_state == 4) {
 		if (xhci_reset_endpoint(ep->dev, ep, 0))
 			return -1;
 	}
 
 	/* Enqueue transfer and ring doorbell */
-	const unsigned mps = EC_GET(MPS, di->devctx.eps[ep_id]);
+	const unsigned mps = EC_GET(MPS, epctx);
 	const unsigned dir = (ep->direction == OUT) ? TRB_DIR_OUT : TRB_DIR_IN;
 	xhci_enqueue_td(tr, ep_id, mps, size, data, dir);
 	xhci->dbreg[ep->dev->address] = ep_id;
@@ -681,11 +712,13 @@
 			   "  ep state: %d -> %d\n"
 			   "  usbsts:   0x%08"PRIx32"\n",
 			   ret, ep_state,
-			   EC_GET(STATE, di->devctx.eps[ep_id]),
+			   EC_GET(STATE, epctx),
 			   xhci->opreg->usbsts);
 		return ret;
 	}
 
+	if (ep->direction == IN && data != src)
+		memcpy(src, data, size - ret);
 	return size - ret;
 }
 
@@ -711,9 +744,9 @@
 		      endpoint descriptor configured earlier. */
 
 	xhci_t *const xhci = XHCI_INST(ep->dev->controller);
+	const int slot_id = ep->dev->address;
 	const int ep_id = xhci_ep_id(ep);
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, ep->dev->address);
-	transfer_ring_t *const tr = di->transfer_rings[ep_id];
+	transfer_ring_t *const tr = xhci->dev[slot_id].transfer_rings[ep_id];
 
 	if (reqcount > (TRANSFER_RING_SIZE - 2)) {
 		xhci_debug("reqcount is too high, at most %d supported\n",
@@ -724,7 +757,7 @@
 		xhci_debug("reqsize is too large, at most 64KiB supported\n");
 		return NULL;
 	}
-	if (di->interrupt_queues[ep_id]) {
+	if (xhci->dev[slot_id].interrupt_queues[ep_id]) {
 		xhci_debug("Only one interrupt queue per endpoint supported\n");
 		return NULL;
 	}
@@ -766,13 +799,13 @@
 	intrq->next	= tr->cur;
 	intrq->ready	= NULL;
 	intrq->ep	= ep;
-	di->interrupt_queues[ep_id] = intrq;
+	xhci->dev[slot_id].interrupt_queues[ep_id] = intrq;
 
 	/* Now enqueue all the prepared TRBs but the last
 	   and ring the doorbell. */
 	for (i = 0; i < (reqcount - 1); ++i)
 		xhci_enqueue_trb(tr);
-	xhci->dbreg[ep->dev->address] = ep_id;
+	xhci->dbreg[slot_id] = ep_id;
 
 	return intrq;
 
@@ -791,16 +824,15 @@
 xhci_destroy_intr_queue(endpoint_t *const ep, void *const q)
 {
 	xhci_t *const xhci = XHCI_INST(ep->dev->controller);
+	const int slot_id = ep->dev->address;
 	const int ep_id = xhci_ep_id(ep);
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, ep->dev->address);
-	transfer_ring_t *const tr = di->transfer_rings[ep_id];
+	transfer_ring_t *const tr = xhci->dev[slot_id].transfer_rings[ep_id];
 
 	intrq_t *const intrq = (intrq_t *)q;
 
 	/* Make sure the endpoint is stopped */
-	if (EC_GET(STATE, di->devctx.eps[ep_id]) == 1) {
-		const int cc = xhci_cmd_stop_endpoint(
-				xhci, ep->dev->address, ep_id);
+	if (EC_GET(STATE, xhci->dev[slot_id].ctx.ep[ep_id]) == 1) {
+		const int cc = xhci_cmd_stop_endpoint(xhci, slot_id, ep_id);
 		if (cc != CC_SUCCESS)
 			xhci_debug("Warning: Failed to stop endpoint\n");
 	}
@@ -814,11 +846,11 @@
 		free(phys_to_virt(intrq->next->ptr_low));
 		intrq->next = xhci_next_trb(intrq->next, NULL);
 	}
-	di->interrupt_queues[ep_id] = NULL;
+	xhci->dev[slot_id].interrupt_queues[ep_id] = NULL;
 	free((void *)intrq);
 
 	/* Reset the controller's dequeue pointer and reinitialize the ring */
-	xhci_cmd_set_tr_dq(xhci, ep->dev->address, ep_id, tr->ring, 1);
+	xhci_cmd_set_tr_dq(xhci, slot_id, ep_id, tr->ring, 1);
 	xhci_init_cycle_ring(tr, TRANSFER_RING_SIZE);
 }
 
@@ -843,8 +875,8 @@
 	u8 *reqdata = NULL;
 	while (!reqdata && intrq->ready) {
 		const int ep_id = xhci_ep_id(ep);
-		devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, ep->dev->address);
-		transfer_ring_t *const tr = di->transfer_rings[ep_id];
+		transfer_ring_t *const tr =
+			xhci->dev[ep->dev->address].transfer_rings[ep_id];
 
 		/* Fetch the request's buffer */
 		reqdata = phys_to_virt(intrq->next->ptr_low);
diff --git a/payloads/libpayload/drivers/usb/xhci_commands.c b/payloads/libpayload/drivers/usb/xhci_commands.c
index 3a744b3..009a69c 100644
--- a/payloads/libpayload/drivers/usb/xhci_commands.c
+++ b/payloads/libpayload/drivers/usb/xhci_commands.c
@@ -128,7 +128,7 @@
 	trb_t *const cmd = xhci_next_command_trb(xhci);
 	TRB_SET(TT, cmd, TRB_CMD_ADDRESS_DEV);
 	TRB_SET(ID, cmd, slot_id);
-	cmd->ptr_low = virt_to_phys(ic);
+	cmd->ptr_low = virt_to_phys(ic->raw);
 	xhci_post_command(xhci);
 
 	return xhci_wait_for_command(xhci, cmd, 1);
@@ -143,7 +143,7 @@
 	trb_t *const cmd = xhci_next_command_trb(xhci);
 	TRB_SET(TT, cmd, TRB_CMD_CONFIGURE_EP);
 	TRB_SET(ID, cmd, slot_id);
-	cmd->ptr_low = virt_to_phys(ic);
+	cmd->ptr_low = virt_to_phys(ic->raw);
 	if (config_id == 0)
 		TRB_SET(DC, cmd, 1);
 	xhci_post_command(xhci);
@@ -159,7 +159,7 @@
 	trb_t *const cmd = xhci_next_command_trb(xhci);
 	TRB_SET(TT, cmd, TRB_CMD_EVAL_CTX);
 	TRB_SET(ID, cmd, slot_id);
-	cmd->ptr_low = virt_to_phys(ic);
+	cmd->ptr_low = virt_to_phys(ic->raw);
 	xhci_post_command(xhci);
 
 	return xhci_wait_for_command(xhci, cmd, 1);
diff --git a/payloads/libpayload/drivers/usb/xhci_debug.c b/payloads/libpayload/drivers/usb/xhci_debug.c
index ba644c6..913b545 100644
--- a/payloads/libpayload/drivers/usb/xhci_debug.c
+++ b/payloads/libpayload/drivers/usb/xhci_debug.c
@@ -40,18 +40,18 @@
 	usb_debug(" FIELD2\t0x%08"PRIx32"\n", sc->f2);
 	usb_debug(" FIELD3\t0x%08"PRIx32"\n", sc->f3);
 	usb_debug(" FIELD4\t0x%08"PRIx32"\n", sc->f4);
-	SC_DUMP(ROUTE,  *sc);
-	SC_DUMP(SPEED,  *sc);
-	SC_DUMP(MTT,    *sc);
-	SC_DUMP(HUB,    *sc);
-	SC_DUMP(CTXENT, *sc);
-	SC_DUMP(RHPORT, *sc);
-	SC_DUMP(NPORTS, *sc);
-	SC_DUMP(TTID,   *sc);
-	SC_DUMP(TTPORT, *sc);
-	SC_DUMP(TTT,    *sc);
-	SC_DUMP(UADDR,  *sc);
-	SC_DUMP(STATE,  *sc);
+	SC_DUMP(ROUTE,  sc);
+	SC_DUMP(SPEED,  sc);
+	SC_DUMP(MTT,    sc);
+	SC_DUMP(HUB,    sc);
+	SC_DUMP(CTXENT, sc);
+	SC_DUMP(RHPORT, sc);
+	SC_DUMP(NPORTS, sc);
+	SC_DUMP(TTID,   sc);
+	SC_DUMP(TTPORT, sc);
+	SC_DUMP(TTT,    sc);
+	SC_DUMP(UADDR,  sc);
+	SC_DUMP(STATE,  sc);
 }
 
 void
@@ -63,15 +63,15 @@
 	usb_debug(" TRDQ_L\t0x%08"PRIx32"\n", ec->tr_dq_low);
 	usb_debug(" TRDQ_H\t0x%08"PRIx32"\n", ec->tr_dq_high);
 	usb_debug(" FIELD5\t0x%08"PRIx32"\n", ec->f5);
-	EC_DUMP(STATE,  *ec);
-	EC_DUMP(INTVAL, *ec);
-	EC_DUMP(CERR,   *ec);
-	EC_DUMP(TYPE,   *ec);
-	EC_DUMP(MBS,    *ec);
-	EC_DUMP(MPS,    *ec);
-	EC_DUMP(DCS,    *ec);
-	EC_DUMP(AVRTRB, *ec);
-	EC_DUMP(MXESIT, *ec);
+	EC_DUMP(STATE,  ec);
+	EC_DUMP(INTVAL, ec);
+	EC_DUMP(CERR,   ec);
+	EC_DUMP(TYPE,   ec);
+	EC_DUMP(MBS,    ec);
+	EC_DUMP(MPS,    ec);
+	EC_DUMP(DCS,    ec);
+	EC_DUMP(AVRTRB, ec);
+	EC_DUMP(MXESIT, ec);
 }
 
 void
@@ -79,19 +79,19 @@
 {
 	int i;
 	if (ctx_mask & 1)
-		xhci_dump_slotctx(&dc->slot);
+		xhci_dump_slotctx(dc->slot);
 	for (i = 0; i < SC_GET(CTXENT, dc->slot); ++i) {
 		if (ctx_mask & (2 << i))
-			xhci_dump_epctx(&dc->all_eps[i]);
+			xhci_dump_epctx(dc->ep[i]);
 	}
 }
 
 void
 xhci_dump_inputctx(const inputctx_t *const ic)
 {
-	xhci_debug("Input Control  add: 0x%08"PRIx32"\n", ic->control.add);
-	xhci_debug("Input Control drop: 0x%08"PRIx32"\n", ic->control.drop);
-	xhci_dump_devctx(&ic->dev, ic->control.add);
+	xhci_debug("Input Control  add: 0x%08"PRIx32"\n", *ic->add);
+	xhci_debug("Input Control drop: 0x%08"PRIx32"\n", *ic->drop);
+	xhci_dump_devctx(&ic->dev, *ic->add);
 }
 
 void
diff --git a/payloads/libpayload/drivers/usb/xhci_devconf.c b/payloads/libpayload/drivers/usb/xhci_devconf.c
index ff5c4d0..0bf93cf 100644
--- a/payloads/libpayload/drivers/usb/xhci_devconf.c
+++ b/payloads/libpayload/drivers/usb/xhci_devconf.c
@@ -37,9 +37,7 @@
 {
 	if (!hubaddr)
 		return 0;
-	volatile const devctx_t *const devctx =
-		phys_to_virt(xhci->dcbaa[hubaddr]);
-	u32 route_string = SC_GET(ROUTE, devctx->slot);
+	u32 route_string = SC_GET(ROUTE, xhci->dev[hubaddr].ctx.slot);
 	int i;
 	for (i = 0; i < 20; i += 4) {
 		if (!(route_string & (0xf << i))) {
@@ -55,9 +53,7 @@
 {
 	if (!hubaddr)
 		return hubport;
-	volatile const devctx_t *const devctx =
-		phys_to_virt(xhci->dcbaa[hubaddr]);
-	return SC_GET(RHPORT, devctx->slot);
+	return SC_GET(RHPORT, xhci->dev[hubaddr].ctx.slot);
 }
 
 static int
@@ -67,12 +63,11 @@
 {
 	if (!hubaddr)
 		return 0;
-	volatile const devctx_t *const devctx =
-		phys_to_virt(xhci->dcbaa[hubaddr]);
-	if ((*tt = SC_GET(TTID, devctx->slot))) {
-		*tt_port = SC_GET(TTPORT, devctx->slot);
+	const slotctx_t *const slot = xhci->dev[hubaddr].ctx.slot;
+	if ((*tt = SC_GET(TTID, slot))) {
+		*tt_port = SC_GET(TTPORT, slot);
 	} else if (xhci_speed < XHCI_HIGH_SPEED &&
-			SC_GET(SPEED, devctx->slot) == XHCI_HIGH_SPEED) {
+			SC_GET(SPEED, slot) == XHCI_HIGH_SPEED) {
 		*tt = hubaddr;
 		*tt_port = hubport;
 	}
@@ -130,20 +125,45 @@
 	}
 }
 
+static inputctx_t *
+xhci_make_inputctx(const size_t ctxsize)
+{
+	int i;
+	const size_t size = (1 + NUM_EPS) * ctxsize;
+	inputctx_t *const ic = malloc(sizeof(*ic));
+	void *dma_buffer = dma_memalign(64, size);
+
+	if (!ic || !dma_buffer) {
+		free(ic);
+		free(dma_buffer);
+		return NULL;
+	}
+
+	memset(dma_buffer, 0, size);
+	ic->drop = dma_buffer + 0;
+	ic->add = dma_buffer + 4;
+	dma_buffer += ctxsize;
+	for (i = 0; i < NUM_EPS; i++, dma_buffer += ctxsize)
+		ic->dev.ep[i] = dma_buffer;
+
+	return ic;
+}
+
 int
 xhci_set_address (hci_t *controller, int speed, int hubport, int hubaddr)
 {
 	xhci_t *const xhci = XHCI_INST(controller);
 	const int xhci_speed = speed + 1;
+	const size_t ctxsize = CTXSIZE(xhci);
+	devinfo_t *di = NULL;
 
-	int ret = -1;
+	int i, ret = -1;
 
-	inputctx_t *const ic = xhci_align(64, sizeof(*ic));
-	devinfo_t *const di = memalign(sizeof(di->devctx), sizeof(*di));
+	inputctx_t *const ic = xhci_make_inputctx(ctxsize);
 	transfer_ring_t *const tr = malloc(sizeof(*tr));
 	if (tr)
 		tr->ring = xhci_align(16, TRANSFER_RING_SIZE * sizeof(trb_t));
-	if (!ic || !di || !tr || !tr->ring) {
+	if (!ic || !tr || !tr->ring) {
 		xhci_debug("Out of memory\n");
 		goto _free_return;
 	}
@@ -157,9 +177,15 @@
 		xhci_debug("Enabled slot %d\n", slot_id);
 	}
 
-	memset(ic, 0x00, sizeof(*ic));
-	ic->control.add = (1 << 0) /* Slot Context */ |
-			  (1 << 1) /* EP0 Context */ ;
+	di = &xhci->dev[slot_id];
+	void *dma_buffer = dma_memalign(64, NUM_EPS * ctxsize);
+	if (!dma_buffer)
+		goto _free_return;
+	memset(dma_buffer, 0, NUM_EPS * ctxsize);
+	for (i = 0; i < NUM_EPS; i++, dma_buffer += ctxsize)
+		di->ctx.ep[i] = dma_buffer;
+
+	*ic->add = (1 << 0) /* Slot Context */ | (1 << 1) /* EP0 Context */ ;
 
 	SC_SET(ROUTE,	ic->dev.slot, xhci_gen_route(xhci, hubport, hubaddr));
 	SC_SET(SPEED,	ic->dev.slot, xhci_speed);
@@ -169,27 +195,23 @@
 	int tt, tt_port;
 	if (xhci_get_tt(xhci, xhci_speed, hubport, hubaddr, &tt, &tt_port)) {
 		xhci_debug("TT for %d: %d[%d]\n", slot_id, tt, tt_port);
-		volatile const devctx_t *const ttctx =
-			phys_to_virt(xhci->dcbaa[tt]);
-		SC_SET(MTT, ic->dev.slot, SC_GET(MTT, ttctx->slot));
+		SC_SET(MTT, ic->dev.slot, SC_GET(MTT, xhci->dev[tt].ctx.slot));
 		SC_SET(TTID, ic->dev.slot, tt);
 		SC_SET(TTPORT, ic->dev.slot, tt_port);
 	}
 
-	memset(di, 0x00, sizeof(*di));
 	di->transfer_rings[1] = tr;
 	xhci_init_cycle_ring(tr, TRANSFER_RING_SIZE);
 
-	ic->dev.ep0.tr_dq_low	= virt_to_phys(tr->ring);
-	ic->dev.ep0.tr_dq_high	= 0;
+	ic->dev.ep0->tr_dq_low	= virt_to_phys(tr->ring);
+	ic->dev.ep0->tr_dq_high	= 0;
 	EC_SET(TYPE,	ic->dev.ep0, EP_CONTROL);
 	EC_SET(AVRTRB,	ic->dev.ep0, 8);
 	EC_SET(MPS,	ic->dev.ep0, 8);
 	EC_SET(CERR,	ic->dev.ep0, 3);
 	EC_SET(DCS,	ic->dev.ep0, 1);
 
-	volatile devctx_t *const oc = &di->devctx;
-	xhci->dcbaa[slot_id] = virt_to_phys(oc);
+	xhci->dcbaa[slot_id] = virt_to_phys(di->ctx.raw);
 
 	cc = xhci_cmd_address_device(xhci, slot_id, ic);
 	if (cc != CC_SUCCESS) {
@@ -197,7 +219,7 @@
 		goto _disable_return;
 	} else {
 		xhci_debug("Addressed device %d (USB: %d)\n",
-			  slot_id, SC_GET(UADDR, oc->slot));
+			  slot_id, SC_GET(UADDR, di->ctx.slot));
 	}
 	mdelay(2); /* SetAddress() recovery interval (usb20 spec 9.2.6.3) */
 
@@ -209,9 +231,8 @@
 	if (mps0 < 0) {
 		goto _disable_return;
 	} else if (mps0 != 8) {
-		memset(&ic->control, 0x00, sizeof(ic->control));
-		memset(&ic->dev.ep0, 0x00, sizeof(ic->dev.ep0));
-		ic->control.add = (1 << 1); /* EP0 Context */
+		memset((void *)ic->dev.ep0, 0x00, ctxsize);
+		*ic->add = (1 << 1); /* EP0 Context */
 		EC_SET(MPS, ic->dev.ep0, mps0);
 		cc = xhci_cmd_evaluate_context(xhci, slot_id, ic);
 		if (cc != CC_SUCCESS) {
@@ -232,8 +253,12 @@
 	if (tr)
 		free((void *)tr->ring);
 	free(tr);
+	if (di)
+		free(di->ctx.raw);
 	free((void *)di);
 _free_ic_return:
+	if (ic)
+		free(ic->raw);
 	free(ic);
 	return ret;
 }
@@ -291,8 +316,6 @@
 xhci_finish_ep_config(const endpoint_t *const ep, inputctx_t *const ic)
 {
 	xhci_t *const xhci = XHCI_INST(ep->dev->controller);
-	devinfo_t *const di = phys_to_virt(xhci->dcbaa[ep->dev->address]
-					   - offsetof(devinfo_t, devctx));
 	const int ep_id = xhci_ep_id(ep);
 	xhci_debug("ep_id: %d\n", ep_id);
 	if (ep_id <= 1 || 32 <= ep_id)
@@ -306,30 +329,30 @@
 		xhci_debug("Out of memory\n");
 		return OUT_OF_MEMORY;
 	}
-	di->transfer_rings[ep_id] = tr;
+	xhci->dev[ep->dev->address].transfer_rings[ep_id] = tr;
 	xhci_init_cycle_ring(tr, TRANSFER_RING_SIZE);
 
-	ic->control.add |= (1 << ep_id);
+	*ic->add |= (1 << ep_id);
 	if (SC_GET(CTXENT, ic->dev.slot) < ep_id)
 		SC_SET(CTXENT, ic->dev.slot, ep_id);
 
-	epctx_t *const epctx = &ic->dev.eps[ep_id];
+	epctx_t *const epctx = ic->dev.ep[ep_id];
 	xhci_debug("Filling epctx (@%p)\n", epctx);
 	epctx->tr_dq_low	= virt_to_phys(tr->ring);
 	epctx->tr_dq_high	= 0;
-	EC_SET(INTVAL,	*epctx, xhci_bound_interval(ep));
-	EC_SET(CERR,	*epctx, 3);
-	EC_SET(TYPE,	*epctx, ep->type | ((ep->direction != OUT) << 2));
-	EC_SET(MPS,	*epctx, ep->maxpacketsize);
-	EC_SET(DCS,	*epctx, 1);
+	EC_SET(INTVAL,	epctx, xhci_bound_interval(ep));
+	EC_SET(CERR,	epctx, 3);
+	EC_SET(TYPE,	epctx, ep->type | ((ep->direction != OUT) << 2));
+	EC_SET(MPS,	epctx, ep->maxpacketsize);
+	EC_SET(DCS,	epctx, 1);
 	size_t avrtrb;
 	switch (ep->type) {
 		case BULK: case ISOCHRONOUS:	avrtrb = 3 * 1024; break;
 		case INTERRUPT:			avrtrb =     1024; break;
 		default:			avrtrb =        8; break;
 	}
-	EC_SET(AVRTRB,	*epctx, avrtrb);
-	EC_SET(MXESIT,  *epctx, EC_GET(MPS, *epctx) * EC_GET(MBS, *epctx));
+	EC_SET(AVRTRB,	epctx, avrtrb);
+	EC_SET(MXESIT,  epctx, EC_GET(MPS, epctx) * EC_GET(MBS, epctx));
 
 	return 0;
 }
@@ -338,24 +361,22 @@
 xhci_finish_device_config(usbdev_t *const dev)
 {
 	xhci_t *const xhci = XHCI_INST(dev->controller);
-	devinfo_t *const di = phys_to_virt(xhci->dcbaa[dev->address]
-					   - offsetof(devinfo_t, devctx));
+	devinfo_t *const di = &xhci->dev[dev->address];
 
 	int i, ret = 0;
 
-	inputctx_t *const ic = xhci_align(64, sizeof(*ic));
+	inputctx_t *const ic = xhci_make_inputctx(CTXSIZE(xhci));
 	if (!ic) {
 		xhci_debug("Out of memory\n");
 		return OUT_OF_MEMORY;
 	}
-	memset(ic, 0x00, sizeof(*ic));
 
-	ic->control.add = (1 << 0); /* Slot Context */
+	*ic->add = (1 << 0); /* Slot Context */
 
-	xhci_dump_slotctx((const slotctx_t *)&di->devctx.slot);
-	ic->dev.slot.f1 = di->devctx.slot.f1;
-	ic->dev.slot.f2 = di->devctx.slot.f2;
-	ic->dev.slot.f3 = di->devctx.slot.f3;
+	xhci_dump_slotctx(di->ctx.slot);
+	ic->dev.slot->f1 = di->ctx.slot->f1;
+	ic->dev.slot->f2 = di->ctx.slot->f2;
+	ic->dev.slot->f3 = di->ctx.slot->f3;
 
 	if (((device_descriptor_t *)dev->descriptor)->bDeviceClass == 0x09) {
 		ret = xhci_finish_hub_config(dev, ic);
@@ -394,6 +415,7 @@
 		di->transfer_rings[i] = NULL;
 	}
 _free_return:
+	free(ic->raw);
 	free(ic);
 	return ret;
 }
@@ -412,7 +434,7 @@
 	if (cc != CC_SUCCESS)
 		xhci_debug("Failed to disable slot %d: %d\n", slot_id, cc);
 
-	devinfo_t *const di = DEVINFO_FROM_XHCI(xhci, slot_id);
+	devinfo_t *const di = &xhci->dev[slot_id];
 	for (i = 1; i < 31; ++i) {
 		if (di->transfer_rings[i])
 			free((void *)di->transfer_rings[i]->ring);
@@ -420,6 +442,5 @@
 
 		free(di->interrupt_queues[i]);
 	}
-	free(di);
 	xhci->dcbaa[slot_id] = 0;
 }
diff --git a/payloads/libpayload/drivers/usb/xhci_events.c b/payloads/libpayload/drivers/usb/xhci_events.c
index c27cc7a..b947c7d 100644
--- a/payloads/libpayload/drivers/usb/xhci_events.c
+++ b/payloads/libpayload/drivers/usb/xhci_events.c
@@ -86,12 +86,10 @@
 	const int id = TRB_GET(ID, ev);
 	const int ep = TRB_GET(EP, ev);
 
-	devinfo_t *di;
 	intrq_t *intrq;
 
 	if (id && id <= xhci->max_slots_en &&
-			(di = DEVINFO_FROM_XHCI(xhci, id)) &&
-			(intrq = di->interrupt_queues[ep])) {
+			(intrq = xhci->dev[id].interrupt_queues[ep])) {
 		/* It's a running interrupt endpoint */
 		intrq->ready = phys_to_virt(ev->ptr_low);
 		if (cc == CC_SUCCESS || cc == CC_SHORT_PACKET) {
diff --git a/payloads/libpayload/drivers/usb/xhci_private.h b/payloads/libpayload/drivers/usb/xhci_private.h
index 5cc115a..0569533 100644
--- a/payloads/libpayload/drivers/usb/xhci_private.h
+++ b/payloads/libpayload/drivers/usb/xhci_private.h
@@ -198,13 +198,13 @@
 #define SC_STATE_START		27
 #define SC_STATE_LEN		8
 #define SC_MASK(tok)		MASK(SC_##tok##_START, SC_##tok##_LEN)
-#define SC_GET(tok, sc)		(((sc).SC_##tok##_FIELD & SC_MASK(tok)) \
+#define SC_GET(tok, sc)		(((sc)->SC_##tok##_FIELD & SC_MASK(tok)) \
 				 >> SC_##tok##_START)
-#define SC_SET(tok, sc, to)	(sc).SC_##tok##_FIELD = \
-				(((sc).SC_##tok##_FIELD & ~SC_MASK(tok)) | \
+#define SC_SET(tok, sc, to)	(sc)->SC_##tok##_FIELD = \
+				(((sc)->SC_##tok##_FIELD & ~SC_MASK(tok)) | \
 				 (((to) << SC_##tok##_START) & SC_MASK(tok)))
 #define SC_DUMP(tok, sc)	usb_debug(" "#tok"\t0x%04"PRIx32"\n", SC_GET(tok, sc))
-typedef struct slotctx {
+typedef volatile struct slotctx {
 	u32 f1;
 	u32 f2;
 	u32 f3;
@@ -240,15 +240,15 @@
 #define EC_MXESIT_START		16
 #define EC_MXESIT_LEN		16
 #define EC_MASK(tok)		MASK(EC_##tok##_START, EC_##tok##_LEN)
-#define EC_GET(tok, ec)		(((ec).EC_##tok##_FIELD & EC_MASK(tok)) \
+#define EC_GET(tok, ec)		(((ec)->EC_##tok##_FIELD & EC_MASK(tok)) \
 				 >> EC_##tok##_START)
-#define EC_SET(tok, ec, to)	(ec).EC_##tok##_FIELD = \
-				(((ec).EC_##tok##_FIELD & ~EC_MASK(tok)) | \
+#define EC_SET(tok, ec, to)	(ec)->EC_##tok##_FIELD = \
+				(((ec)->EC_##tok##_FIELD & ~EC_MASK(tok)) | \
 				 (((to) << EC_##tok##_START) & EC_MASK(tok)))
 #define EC_DUMP(tok, ec)	usb_debug(" "#tok"\t0x%04"PRIx32"\n", EC_GET(tok, ec))
 enum { EP_ISOC_OUT = 1, EP_BULK_OUT = 2, EP_INTR_OUT = 3,
 	EP_CONTROL = 4, EP_ISOC_IN = 5, EP_BULK_IN = 6, EP_INTR_IN = 7 };
-typedef struct epctx {
+typedef volatile struct epctx {
 	u32 f1;
 	u32 f2;
 	u32 tr_dq_low;
@@ -257,23 +257,30 @@
 	u32 rsvd[3];
 } epctx_t;
 
+#define NUM_EPS 32
+#define CTXSIZE(xhci) ((xhci)->capreg->csz ? 64 : 32)
+
 typedef union devctx {
+	/* set of pointers, so we can dynamically adjust Slot/EP context size */
 	struct {
-		slotctx_t slot;
-		epctx_t ep0;
-		epctx_t eps1_30[30];
+		union {
+			slotctx_t *slot;
+			void *raw;	/* Pointer to the whole dev context. */
+		};
+		epctx_t *ep0;
+		epctx_t *eps1_30[NUM_EPS - 2];
 	};
-	epctx_t eps[32]; /* At index 0 it's actually the slotctx,
-			    we have it like that so we can use
-			    the ep_id directly as index. */
+	epctx_t *ep[NUM_EPS];	/* At index 0 it's actually the slotctx,
+					we have it like that so we can use
+					the ep_id directly as index. */
 } devctx_t;
 
 typedef struct inputctx {
-	struct {
-		u32 drop;
-		u32 add;
-		u32 reserved[6];
-	} control;
+	union {		    /* The drop flags are located at the start of the */
+		u32 *drop;  /* structure, so a pointer to them is equivalent */
+		void *raw;  /* to a pointer to the whole (raw) input context. */
+	};
+	u32 *add;
 	devctx_t dev;
 } inputctx_t;
 
@@ -286,14 +293,10 @@
 } intrq_t;
 
 typedef struct devinfo {
-	volatile devctx_t devctx;
-	transfer_ring_t *transfer_rings[32];
+	devctx_t ctx;
+	transfer_ring_t *transfer_rings[NUM_EPS];
 	intrq_t *interrupt_queues[32];
 } devinfo_t;
-#define DEVINFO_FROM_XHCI(xhci, slot_id) \
-	(((xhci)->dcbaa[slot_id]) \
-	 ? phys_to_virt((xhci)->dcbaa[slot_id] - offsetof(devinfo_t, devctx)) \
-	 : NULL)
 
 typedef struct erst_entry {
 	u32 seg_base_lo;
@@ -459,6 +462,10 @@
 	usbdev_t *roothub;
 
 	u8 max_slots_en;
+	devinfo_t *dev;	/* array of devinfos by slot_id */
+
+#define DMA_SIZE (64 * 1024)
+	void *dma_buffer;
 } xhci_t;
 
 #define XHCI_INST(controller) ((xhci_t*)((controller)->instance))
diff --git a/payloads/libpayload/drivers/usb/xhci_rh.c b/payloads/libpayload/drivers/usb/xhci_rh.c
index e6052be..ca6131f 100644
--- a/payloads/libpayload/drivers/usb/xhci_rh.c
+++ b/payloads/libpayload/drivers/usb/xhci_rh.c
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  */
 
-#define USB_DEBUG
+//#define USB_DEBUG
 
 #include <usb/usb.h>
 #include "generic_hub.h"