nyan: tegra124: Enable I, D and L2 caches in romstage.

This speeds up execution but does require cache management in drivers.

BUG=None
TEST=Built and booted into depthcharge on nyan. Measured a speedup in
execution.
BRANCH=None

Change-Id: I7efe6af2c38e41402fa874ed59798f136e7e8ad4
Signed-off-by: Gabe Black <gabeblack@google.com>
Reviewed-on: https://chromium-review.googlesource.com/173777
Reviewed-by: Gabe Black <gabeblack@chromium.org>
Commit-Queue: Gabe Black <gabeblack@chromium.org>
Tested-by: Gabe Black <gabeblack@chromium.org>
diff --git a/src/arch/arm/armv7/Makefile.inc b/src/arch/arm/armv7/Makefile.inc
index 481898d..99560dd 100644
--- a/src/arch/arm/armv7/Makefile.inc
+++ b/src/arch/arm/armv7/Makefile.inc
@@ -45,6 +45,9 @@
 ifeq ($(CONFIG_ARM_ROMSTAGE_ARMV7),y)
 
 romstage-y += cache.c
+romstage-y += exception.c
+romstage-y += exception_asm.S
+romstage-y += mmu.c
 
 romstage-c-ccopts += $(armv7_flags)
 romstage-S-ccopts += $(armv7_flags)
diff --git a/src/mainboard/google/nyan/Kconfig b/src/mainboard/google/nyan/Kconfig
index 9164489..99a39e0 100644
--- a/src/mainboard/google/nyan/Kconfig
+++ b/src/mainboard/google/nyan/Kconfig
@@ -40,6 +40,14 @@
 	int
 	default 2048
 
+config DRAM_DMA_START
+	hex
+	default 0x83000000
+
+config DRAM_DMA_SIZE
+	hex
+	default 0x00100000
+
 choice
 	prompt "BCT boot media"
 	default BCT_CFG_SPI
diff --git a/src/mainboard/google/nyan/mainboard.c b/src/mainboard/google/nyan/mainboard.c
index d7a6656..466c14e 100644
--- a/src/mainboard/google/nyan/mainboard.c
+++ b/src/mainboard/google/nyan/mainboard.c
@@ -223,3 +223,14 @@
 	.name	= "nyan",
 	.enable_dev = mainboard_enable,
 };
+
+void lb_board(struct lb_header *header)
+{
+	/* New LB_TAB_DMA record describing the DRAM DMA range. */
+	struct lb_range *range = (struct lb_range *)lb_new_record(header);
+
+	range->tag = LB_TAB_DMA;
+	range->size = sizeof(*range);
+	range->range_start = CONFIG_DRAM_DMA_START;
+	range->range_size = CONFIG_DRAM_DMA_SIZE;
+}
diff --git a/src/mainboard/google/nyan/romstage.c b/src/mainboard/google/nyan/romstage.c
index 5a66dde..a31f1f1 100644
--- a/src/mainboard/google/nyan/romstage.c
+++ b/src/mainboard/google/nyan/romstage.c
@@ -17,6 +17,9 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <arch/cache.h>
+#include <arch/cpu.h>
+#include <arch/exception.h>
 #include <arch/stages.h>
 #include <device/device.h>
 #include <cbfs.h>
@@ -25,11 +28,78 @@
 #include "soc/nvidia/tegra124/chip.h"
 #include <soc/display.h>
 
+// DRAM layout in MB units (addresses are shifted right by 20).
+#define DRAM_START	(CONFIG_SYS_SDRAM_BASE >> 20)
+#define DRAM_SIZE	CONFIG_DRAM_SIZE_MB
+#define DRAM_END	(DRAM_START + DRAM_SIZE)	/* exclusive end */
+
+enum {	/* fields of the L2 Control Register (L2CTLR) */
+	L2CTLR_ECC_PARITY = 0x1 << 21,
+	L2CTLR_TAG_RAM_LATENCY_MASK = 0x7 << 6,
+	L2CTLR_TAG_RAM_LATENCY_CYCLES_3 = 2 << 6,	/* value 2 = 3 cycles */
+	L2CTLR_DATA_RAM_LATENCY_MASK = 0x7 << 0,
+	L2CTLR_DATA_RAM_LATENCY_CYCLES_3  = 2 << 0	/* value 2 = 3 cycles */
+};
+
+enum {	/* bits of the L2 Auxiliary Control Register (L2ACTLR) */
+	L2ACTLR_FORCE_L2_LOGIC_CLOCK_ENABLE_ACTIVE = 0x1 << 27,
+	L2ACTLR_ENABLE_HAZARD_DETECT_TIMEOUT = 0x1 << 7,
+	L2ACTLR_DISABLE_CLEAN_EVICT_PUSH_EXTERNAL = 0x1 << 3
+};
+
+/* Sets 3-cycle DATA/TAG RAM latency and L2CTLR_ECC_PARITY in L2CTLR. */
+static void configure_l2ctlr(void)
+{
+	uint32_t l2ctlr = read_l2ctlr();
+
+	l2ctlr &= ~L2CTLR_TAG_RAM_LATENCY_MASK;
+	l2ctlr &= ~L2CTLR_DATA_RAM_LATENCY_MASK;
+	l2ctlr |= L2CTLR_TAG_RAM_LATENCY_CYCLES_3;
+	l2ctlr |= L2CTLR_DATA_RAM_LATENCY_CYCLES_3 | L2CTLR_ECC_PARITY;
+	write_l2ctlr(l2ctlr);
+}
+
+/* Sets three L2 Auxiliary Control Register bits for Cortex A15. */
+static void configure_l2actlr(void)
+{
+	uint32_t l2actlr = read_l2actlr();
+
+	/* Read-modify-write: bits are only ORed in, none are cleared. */
+	l2actlr |= L2ACTLR_FORCE_L2_LOGIC_CLOCK_ENABLE_ACTIVE;
+	l2actlr |= L2ACTLR_ENABLE_HAZARD_DETECT_TIMEOUT;
+	l2actlr |= L2ACTLR_DISABLE_CLEAN_EVICT_PUSH_EXTERNAL;
+	write_l2actlr(l2actlr);
+}
+
 void main(void)
 {
-	void *entry;
-	const struct device *soc;
-	const struct soc_nvidia_tegra124_config *config;
+	// Globally disable MMU, caches and branch prediction (these should
+	// already be disabled by default on reset).
+	uint32_t sctlr = read_sctlr();
+	sctlr &= ~(SCTLR_M | SCTLR_C | SCTLR_Z | SCTLR_I);
+	write_sctlr(sctlr);
+
+	arm_invalidate_caches();
+
+	// Re-enable icache and branch prediction.
+	sctlr = read_sctlr();
+	sctlr |= SCTLR_Z | SCTLR_I;
+	write_sctlr(sctlr);
+
+	configure_l2ctlr();
+	configure_l2actlr();
+
+	mmu_init();
+	mmu_config_range(0, DRAM_START, DCACHE_OFF); // below DRAM
+	mmu_config_range(DRAM_START, DRAM_SIZE, DCACHE_WRITEBACK); // DRAM
+	mmu_config_range(CONFIG_DRAM_DMA_START >> 20,
+			 CONFIG_DRAM_DMA_SIZE >> 20, DCACHE_OFF); // DMA window
+	mmu_config_range(DRAM_END, 4096 - DRAM_END, DCACHE_OFF); // above DRAM
+	mmu_disable_range(0, 1); // NOTE(review): presumably traps NULL derefs
+	dcache_invalidate_all();
+	dcache_mmu_enable();
+
+	exception_init();
 
 	/* for quality of the user interface, it's important to get
 	 * the video going ASAP. Because there are long delays in some
@@ -39,15 +109,17 @@
 	 * early as we can in the RW stage, but never in the RO stage.
 	 */
 
-	soc = dev_find_slot(DEVICE_PATH_CPU_CLUSTER, 0);
+	const struct device *soc = dev_find_slot(DEVICE_PATH_CPU_CLUSTER, 0);
 	printk(BIOS_SPEW, "s%s: soc is %p\n", __func__, soc);
-	if (soc && soc->chip_info){
-		config = soc->chip_info;
+	if (soc && soc->chip_info) {
+		const struct soc_nvidia_tegra124_config *config =
+			soc->chip_info;
 		setup_display((struct soc_nvidia_tegra124_config *)config);
 	}
 
 	cbmem_initialize_empty();
 
-	entry = cbfs_load_stage(CBFS_DEFAULT_MEDIA, "fallback/coreboot_ram");
+	void *entry = cbfs_load_stage(CBFS_DEFAULT_MEDIA,
+				      "fallback/coreboot_ram");
 	stage_exit(entry);
 }
diff --git a/src/soc/nvidia/tegra124/Kconfig b/src/soc/nvidia/tegra124/Kconfig
index d0f6888..dc4b634 100644
--- a/src/soc/nvidia/tegra124/Kconfig
+++ b/src/soc/nvidia/tegra124/Kconfig
@@ -68,6 +68,11 @@
 	hex
 	default 0x800
 
+# TTB needs to be aligned to 16KB. Stick it in iRAM.
+config TTB_BUFFER
+	hex "memory address of the TTB buffer"
+	default 0x40000000
+
 config CBFS_CACHE_ADDRESS
 	hex "memory address to put CBFS cache data"
 	default 0x803c0000