libpayload: Make EHCI driver cache-aware

This patch makes the EHCI driver work on ARM platforms which usually do
not support automatic cache snooping. It uses the new DMA memory
mechanism (which needs to be correctly set up in the Coreboot mainboard
code) to allocate all EHCI-internal communication structures in
cache-coherent memory, and cleans/invalidates the externally supplied
transfer buffers in Bulk and Control functions with explicit calls as
necessary.

BUG=chrome-os-partner:21969
TEST=Make sure booting from the EHCI port now works without any
additional tweaks.

Change-Id: Ie8a62545d905b7a4fdd2a56b9405774be69779e5
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/167339
diff --git a/payloads/libpayload/drivers/usb/ehci.c b/payloads/libpayload/drivers/usb/ehci.c
index e49f8ab..d45e51d 100644
--- a/payloads/libpayload/drivers/usb/ehci.c
+++ b/payloads/libpayload/drivers/usb/ehci.c
@@ -30,6 +30,7 @@
 //#define USB_DEBUG
 
 #include <libpayload.h>
+#include <arch/cache.h>
 #include "ehci.h"
 #include "ehci_private.h"
 
@@ -361,7 +362,7 @@
 static int ehci_bulk (endpoint_t *ep, int size, u8 *data, int finalize)
 {
 	int result = 0;
-	int remaining = size;
+	int counter = 0;
 	int endp = ep->endpoint & 0xf;
 	int pid = (ep->direction==IN)?EHCI_IN:EHCI_OUT;
 
@@ -372,30 +373,30 @@
 			return -1;
 	}
 
-	qtd_t *head = memalign(64, sizeof(qtd_t));
+	qtd_t *head = dma_memalign(64, sizeof(qtd_t));
 	qtd_t *cur = head;
+	dcache_clean_by_mva(data, size);
 	while (1) {
 		memset((void *)cur, 0, sizeof(qtd_t));
 		cur->token = QTD_ACTIVE |
 			(pid << QTD_PID_SHIFT) |
 			(0 << QTD_CERR_SHIFT);
-		u32 chunk = fill_td(cur, data, remaining);
-		remaining -= chunk;
-		data += chunk;
+		u32 chunk = fill_td(cur, data + counter, size - counter);
+		counter += chunk;
 
 		cur->alt_next_qtd = QTD_TERMINATE;
-		if (remaining == 0) {
+		if (counter >= size) {
 			cur->next_qtd = virt_to_phys(0) | QTD_TERMINATE;
 			break;
 		} else {
-			qtd_t *next = memalign(64, sizeof(qtd_t));
+			qtd_t *next = dma_memalign(64, sizeof(qtd_t));
 			cur->next_qtd = virt_to_phys(next);
 			cur = next;
 		}
 	}
 
 	/* create QH */
-	ehci_qh_t *qh = memalign(64, sizeof(ehci_qh_t));
+	ehci_qh_t *qh = dma_memalign(64, sizeof(ehci_qh_t));
 	memset((void *)qh, 0, sizeof(ehci_qh_t));
 	qh->horiz_link_ptr = virt_to_phys(qh) | QH_QH;
 	qh->epchar = ep->dev->address |
@@ -415,6 +416,7 @@
 
 	result = ehci_process_async_schedule(
 			EHCI_INST(ep->dev->controller), qh, head);
+	dcache_invalidate_by_mva(data, size);
 	if (result >= 0)
 		result = size - result;
 
@@ -443,8 +445,10 @@
 		non_hs_ctrl_ep = 1;
 	}
 
+
 	/* create qTDs */
-	qtd_t *head = memalign(64, sizeof(qtd_t));
+	dcache_clean_by_mva(devreq, drlen);
+	qtd_t *head = dma_memalign(64, sizeof(qtd_t));
 	qtd_t *cur = head;
 	memset((void *)cur, 0, sizeof(qtd_t));
 	cur->token = QTD_ACTIVE |
@@ -454,13 +458,14 @@
 	if (fill_td(cur, devreq, drlen) != drlen) {
 		usb_debug("ERROR: couldn't send the entire device request\n");
 	}
-	qtd_t *next = memalign(64, sizeof(qtd_t));
+	qtd_t *next = dma_memalign(64, sizeof(qtd_t));
 	cur->next_qtd = virt_to_phys(next);
 	cur->alt_next_qtd = QTD_TERMINATE;
 
 	/* FIXME: We're limited to 16-20K (depending on alignment) for payload for now.
 	 * Figure out, how toggle can be set sensibly in this scenario */
 	if (dalen > 0) {
+		dcache_clean_by_mva(data, dalen);
 		toggle ^= 1;
 		cur = next;
 		memset((void *)cur, 0, sizeof(qtd_t));
@@ -471,7 +476,7 @@
 		if (fill_td(cur, data, dalen) != dalen) {
 			usb_debug("ERROR: couldn't send the entire control payload\n");
 		}
-		next = memalign(64, sizeof(qtd_t));
+		next = dma_memalign(64, sizeof(qtd_t));
 		cur->next_qtd = virt_to_phys(next);
 		cur->alt_next_qtd = QTD_TERMINATE;
 	}
@@ -488,7 +493,7 @@
 	cur->alt_next_qtd = QTD_TERMINATE;
 
 	/* create QH */
-	ehci_qh_t *qh = memalign(64, sizeof(ehci_qh_t));
+	ehci_qh_t *qh = dma_memalign(64, sizeof(ehci_qh_t));
 	memset((void *)qh, 0, sizeof(ehci_qh_t));
 	qh->horiz_link_ptr = virt_to_phys(qh) | QH_QH;
 	qh->epchar = dev->address |
@@ -509,6 +514,10 @@
 	if (result >= 0)
 		result = dalen - result;
 
+	dcache_invalidate_by_mva(devreq, drlen);
+	if (dalen > 0)
+		dcache_invalidate_by_mva(data, dalen);
+
 	free_qh_and_tds(qh, head);
 	return result;
 }
@@ -581,13 +590,12 @@
 			return NULL;
 	}
 
-	intr_queue_t *const intrq =
-		(intr_queue_t *)memalign(64, sizeof(intr_queue_t));
+	intr_queue_t *const intrq = (intr_queue_t *)malloc(sizeof(intr_queue_t));
 	/*
 	 * reqcount data chunks
 	 * plus one more spare, which we'll leave out of queue
 	 */
-	u8 *data = (u8 *)malloc(reqsize * (reqcount + 1));
+	u8 *data = (u8 *)dma_malloc(reqsize * (reqcount + 1));
 	if (!intrq || !data)
 		fatal("Not enough memory to create USB interrupt queue.\n");
 	intrq->data = data;
@@ -595,7 +603,7 @@
 	intrq->reqsize = reqsize;
 
 	/* create #reqcount transfer descriptors (qTDs) */
-	intrq->head = (intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+	intrq->head = (intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 	intr_qtd_t *cur_td = intrq->head;
 	for (i = 0; i < reqcount; ++i) {
 		fill_intr_queue_td(intrq, cur_td, data);
@@ -603,7 +611,7 @@
 		if (i < reqcount - 1) {
 			/* create one more qTD */
 			intr_qtd_t *const next_td =
-				(intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+				(intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 			cur_td->td.next_qtd = virt_to_phys(&next_td->td);
 			cur_td->next = next_td;
 			cur_td = next_td;
@@ -612,7 +620,7 @@
 	intrq->tail = cur_td;
 
 	/* create spare qTD */
-	intrq->spare = (intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+	intrq->spare = (intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 	fill_intr_queue_td(intrq, intrq->spare, data);
 
 	/* initialize QH */
@@ -778,7 +786,7 @@
 
 	/* Initialize periodic frame list */
 	/* 1024 32-bit pointers, 4kb aligned */
-	u32 *const periodic_list = (u32 *)memalign(4096, 1024 * sizeof(u32));
+	u32 *const periodic_list = (u32 *)dma_memalign(4096, 1024 * sizeof(u32));
 	if (!periodic_list)
 		fatal("Not enough memory creating EHCI periodic frame list.\n");
 
@@ -787,7 +795,7 @@
 	 * This helps with broken host controllers
 	 * and doesn't violate the standard.
 	 */
-	EHCI_INST(controller)->dummy_qh = (ehci_qh_t *)memalign(64, sizeof(ehci_qh_t));
+	EHCI_INST(controller)->dummy_qh = (ehci_qh_t *)dma_memalign(64, sizeof(ehci_qh_t));
 	memset((void *)EHCI_INST(controller)->dummy_qh, 0,
 		sizeof(*EHCI_INST(controller)->dummy_qh));
 	EHCI_INST(controller)->dummy_qh->horiz_link_ptr = QH_TERMINATE;
diff --git a/payloads/libpayload/include/x86/arch/cache.h b/payloads/libpayload/include/x86/arch/cache.h
index 73849b7..4a80736 100644
--- a/payloads/libpayload/include/x86/arch/cache.h
+++ b/payloads/libpayload/include/x86/arch/cache.h
@@ -32,9 +32,13 @@
 #ifndef __ARCH_CACHE_H__
 #define __ARCH_CACHE_H__
 
-/* These are noops, needed by the USB stack on ARM */
+/* NOOPs mirroring ARM's cache API, since x86 devices usually cache snoop */
 #define dmb()
 #define dsb()
+#define dcache_clean_all()
+#define dcache_clean_by_mva(addr, len)
+#define dcache_invalidate_all()
+#define dcache_invalidate_by_mva(addr, len)
 #define dcache_clean_invalidate_all()
 #define dcache_clean_invalidate_by_mva(addr, len)