src/cpu/intel/slot_1/l2_cache.c - mirrors/cros/chromiumos/third_party/coreboot - Git at Google

 /*
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2000 Denis Dowling <dpd@alphalink.com.au>
  * Copyright (C) 2010 Keith Hui <buurin@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */

 /*
  * Intel Pentium L2 Cache initialization.
  * This code was developed by reverse engineering
  * the BIOS. Where the code accesses documented
  * registers I have added comments as best I can.
  * Some undocumented registers on the Pentium II are
  * used so some of the documentation is incomplete.
  *
  * References:
  * Intel Architecture Software Developer's Manual
  * Volume 3B: System Programming Guide, Part 2 (#253669)
  * Appendix B.9
  */

 /* This code is ported from coreboot v1.
  * The L2 cache initialization sequence here only applies to SECC/SECC2 P6 family
  * CPUs with Klamath (63x), Deschutes (65x) and Katmai (67x) cores.
  * It is not required for Coppermine (68x) and Tualatin (6bx) cores.
  * It is currently not known if Celerons with Mendocino (66x) core require the
  * special initialization.
  * Covington-core Celerons do not have L2 cache.
  */

 #include <stdint.h>
 #include <console/console.h>
 #include <string.h>
 #include <cpu/cpu.h>
 #include <cpu/x86/mtrr.h>
 #include <cpu/intel/l2_cache.h>
 #include <cpu/x86/cache.h>
 #include <cpu/x86/msr.h>

 /* Latency Tables */
 /* One {key, value} row of an L2 latency lookup table. */
 struct latency_entry {
 	/* Lookup key, compared against the masked EBL_CR_POWERON bits;
 	 * 0xFF marks the end of a table. */
 	u8 key;
 	/* Latency value that calculate_l2_latency() ORs into BBL_CR_CTL3. */
 	u8 value;
 };
 /*
 Latency maps for Deschutes and Katmai.
 No such mapping is available for Klamath.

 Cache latency to
 be written to L2 -----++++
 control register      ||||
 0000 xx 00 -----> 000 cccc 0
 ||||    00 66MHz
 ||||    10 100MHz
 ||||    01 133MHz (Katmai "B" only)
 ++++------ CPU frequency multiplier

 0000 2x
 0001 3x
 0010 4x
 0011 5x
 0100 2.5x
 0101 3.5x
 0110 4.5x
 0111 5.5x
 1000 6x
 1001 7x
 1010 8x
 1011 Reserved
 1100 6.5x
 1101 7.5x
 1110 1.5x
 1111 2x

 */
 /*
  * Latency lookup tables. Each table is a list of {key, value} pairs that
  * ends with a {0xFF, 0x00} sentinel; calculate_l2_latency() walks a table
  * from the start until it finds a matching key or reaches the sentinel.
  * The "_tN" suffix is the latency type derived from L2 control register 0.
  */

 /* Model 65x, latency type 0 */
 static const struct latency_entry latency_650_t0[] = {
 	{0x10, 0x02}, {0x50, 0x02}, {0x20, 0x04}, {0x60, 0x06},
 	{0x00, 0x08}, {0x40, 0x0C}, {0x12, 0x06}, {0x52, 0x0A},
 	{0x22, 0x0E}, {0x62, 0x10}, {0x02, 0x10}, {0xFF, 0x00}
 };

 /* Model 65x, latency type 1 */
 static const struct latency_entry latency_650_t1[] = {
 	{0x12, 0x14}, {0x52, 0x16}, {0x22, 0x16}, {0x62, 0x16},
 	{0xFF, 0x00}
 };

 /* Model 67x, latency type 0 */
 static const struct latency_entry latency_670_t0[] = {
 	{0x60, 0x06}, {0x00, 0x08}, {0x12, 0x06}, {0x52, 0x0A},
 	{0x22, 0x0E}, {0x62, 0x10}, {0x02, 0x10}, {0x42, 0x02},
 	{0x11, 0x0E}, {0x51, 0x0C}, {0x21, 0x02}, {0x61, 0x10},
 	{0x01, 0x10}, {0x41, 0x02}, {0xFF, 0x00}
 };

 /* Model 67x, latency type 1 */
 static const struct latency_entry latency_670_t1[] = {
 	{0x22, 0x18}, {0x62, 0x18}, {0x02, 0x1A}, {0x11, 0x18},
 	{0xFF, 0x00}
 };

 /* Model 67x, latency type 2 */
 static const struct latency_entry latency_670_t2[] = {
 	{0x22, 0x12}, {0x62, 0x14}, {0x02, 0x16}, {0x42, 0x1E},
 	{0x11, 0x12}, {0x51, 0x16}, {0x21, 0x1E}, {0x61, 0x14},
 	{0x01, 0x16}, {0x41, 0x1E}, {0xFF, 0x00}
 };

 /* Latency tables for 650 model/type, indexed by latency type 0..2.
  * NOTE(review): type 2 reuses the type 1 table (there is no
  * latency_650_t2 in this file) - confirm this is intentional.
  */
 static const struct latency_entry *latency_650[] = {
 	latency_650_t0, latency_650_t1, latency_650_t1
 };

 /* Latency tables for 670 model/type, indexed by latency type 0..2 */
 static const struct latency_entry *latency_670[] = {
 	latency_670_t0, latency_670_t1, latency_670_t2
 };

 int calculate_l2_latency(void)
 {
 	u32 eax, l, signature;
 	const struct latency_entry *latency_table, *le;
 	msr_t msr;

 	/* First, attempt to get cache latency value from
 	   IA32_PLATFORM_ID[56:53]. (L2 Cache Latency Read)
 	 */
 	msr = rdmsr(IA32_PLATFORM_ID);

 	printk(BIOS_DEBUG,"rdmsr(IA32_PLATFORM_ID) = %x:%x\n", msr.hi, msr.lo);

 	l = (msr.hi >> 20) & 0x1e;

 	if (l == 0) {
 		/* If latency value isn't available from
 		   IA32_PLATFORM_ID[56:53], read it from
 		   L2 control register 0 for lookup from
 		   tables. */
 		int t, a;

 		/* The raw code is read from L2 register 0, bits [7:4]. */
 		a = read_l2(0);
 		if (a < 0)
 			return -1;

 		a &= 0xf0;

 		if ((a & 0x20) == 0)
 			t = 0;
 		else if (a == 0x20)
 			t = 1;
 		else if (a == 0x30)
 			t = 2;
 		else
 			return -1;

 		printk(BIOS_DEBUG,"L2 latency type = %x\n", t);

 		/* Get CPUID family/model */
 		signature = cpuid_eax(1) & 0xfff0;

 		/* Read EBL_CR_POWERON */
 		msr = rdmsr(EBL_CR_POWERON);
 		/* Get clock multiplier and FSB frequency.
 		 * Multiplier is in [25:22].
 		 * FSB is in [19:18] in Katmai, [19] in Deschutes ([18] is zero for them).
 		 */
 		eax = msr.lo >> 18;
 		if (signature == 0x650) {
 			eax &= ~0xf2;
 			latency_table = latency_650[t];
 		} else if (signature == 0x670) {
 			eax &= ~0xf3;
 			latency_table = latency_670[t];
 		} else
 			return -1;

 		/* Search table for matching entry */
 		for (le = latency_table; le->key != eax; le++) {
 			/* Fail if we get to the end of the table */
 			if (le->key == 0xff) {
 				printk(BIOS_DEBUG, "Could not find key %02x in latency table\n", eax);
 				return -1;
 			}
 		}

 		l = le->value;
 	}

 	printk(BIOS_DEBUG,"L2 Cache latency is %d\n", l / 2);

 	/* Writes the calculated latency in BBL_CR_CTL3[4:1]. */
 	msr = rdmsr(BBL_CR_CTL3);
 	msr.lo &= 0xffffffe1;
 	msr.lo |= l;
 	wrmsr(BBL_CR_CTL3, msr);

 	return 0;
 }


 /*
  * Issue one command to the L2 cache controller.
  *
  * Loads `address` into BBL_CR_ADDR and data_high:data_low into each of
  * BBL_CR_D0..BBL_CR_D3, places `command` and `way` into BBL_CR_CTL, writes
  * BBL_CR_TRIG to start the operation and then polls BBL_CR_BUSY.
  *
  * Returns 0 once the busy bit reads clear, -1 if it is still set after
  * 0x100 polls.
  */
 int signal_l2(u32 address, u32 data_high, u32 data_low, int way, u8 command)
 {
 	msr_t reg;
 	int data_msr, polls_left;

 	/* BBL_CR_ADDR <- address, upper half zero */
 	reg.hi = 0;
 	reg.lo = address;
 	wrmsr(BBL_CR_ADDR, reg);

 	/* The same 64-bit value goes into every data register */
 	reg.hi = data_high;
 	reg.lo = data_low;
 	data_msr = BBL_CR_D0;
 	while (data_msr <= BBL_CR_D3) {
 		wrmsr(data_msr, reg);
 		data_msr++;
 	}

 	/* Clear bits [4:0] and [9:8] of BBL_CR_CTL, then OR in the command
 	 * and the way (shifted to bit 8); all other bits are preserved. */
 	reg = rdmsr(BBL_CR_CTL);
 	reg.lo &= 0xfffffce0;
 	reg.lo |= command | (way << 8);
 	wrmsr(BBL_CR_CTL, reg);

 	/* Writing BBL_CR_TRIG kicks off the command */
 	reg.hi = 0;
 	reg.lo = 0;
 	wrmsr(BBL_CR_TRIG, reg);

 	/* Wait for bit 0 of BBL_CR_BUSY to drop */
 	for (polls_left = 0x100; polls_left > 0; polls_left--) {
 		reg = rdmsr(BBL_CR_BUSY);
 		if (!(reg.lo & 1))
 			return 0;
 	}

 	/* Still busy after all polls: report a timeout */
 	return -1;
 }

 /*
  * Fetch the contents of the L2 cache controller register `address`.
  *
  * Returns the register value (BBL_CR_ADDR shifted right by 21 bits) or
  * -1 if the controller command fails.
  */
 int read_l2(u32 address)
 {
 	msr_t result;
 	int status;

 	/* L2 Control Register Read; the register number goes in at bit 5 */
 	status = signal_l2(address << 5, 0, 0, 0, L2CMD_CR);
 	if (status)
 		return -1;

 	/* The controller hands the value back through BBL_CR_ADDR */
 	result = rdmsr(BBL_CR_ADDR);
 	return result.lo >> 21;
 }

 /*
  * Store the low 8 bits of `data` in L2 controller register `address`.
  *
  * The write command is issued several times; the repeat count comes from
  * L2 register 2 and depends on bit 5 of L2 register 0. Why the write must
  * be replicated is not known.
  *
  * Returns 0 on success, -1 if reading registers 0/2 or any write fails.
  */
 int write_l2(u32 address, u32 data)
 {
 	int reg0, reg2, copies, n;
 	u32 payload;

 	reg0 = read_l2(0);
 	if (reg0 < 0)
 		return -1;

 	reg2 = read_l2(2);
 	if (reg2 < 0)
 		return -1;

 	/* Repeat count: reg2[2:0] when reg0 bit 5 is set, else reg2[1:0] + 1 */
 	copies = (reg0 & 0x20) ? (reg2 & 0x7) : (reg2 & 0x3) + 1;

 	/*
 	 * Layout of the word handed to signal_l2():
 	 *   f = data byte, a = register address, d/c = bits 1/0 of the
 	 *   repeat index (c appears twice)
 	 *   000fffff fff000c0 000dcaaa aaa00000
 	 */
 	payload = (data & 0xff) << 21;

 	for (n = 0; n < copies; n++) {
 		u32 index_bits = (n << 11) & 0x1800;
 		u32 mirror_bit = (index_bits << 6) & 0x20000;
 		u32 word = (address << 5) | payload | index_bits | mirror_bit;

 		/* Command 3 is issued with no data and way 0 */
 		if (signal_l2(word, 0, 0, 0, 3) != 0)
 			return -1;
 	}

 	return 0;
 }

 /*
  * Write data_high:data_low into the cache at address1, then read back
  * from address2 and compare all of BBL_CR_D0..BBL_CR_D3 with that data.
  *
  * Returns:
  *    0  the data read at address2 matches (the two addresses alias);
  *   -1  an L2 controller command failed;
  *   >0  mismatch: the low 16 bits of the first mismatching low word, or
  *       0x10000 when those 16 bits are all zero. Previously such a
  *       mismatch returned 0 and was indistinguishable from a match.
  */
 int test_l2_address_alias(u32 address1, u32 address2,
 				 u32 data_high, u32 data_low)
 {
 	int d;
 	msr_t msr;

 	/* Tag Write with Data Write for L2 */
 	if (signal_l2(address1, data_high, data_low, 0, L2CMD_TWW))
 		return -1;

 	/* Tag Read with Data Read for L2 */
 	if (signal_l2(address2, 0, 0, 0, L2CMD_TRR))
 		return -1;

 	/* Read data from BBL_CR_D[0-3] */
 	for (d = BBL_CR_D0; d <= BBL_CR_D3; d++) {
 		msr = rdmsr(d);
 		if (msr.lo != data_low || msr.hi != data_high) {
 			int low16 = msr.lo & 0xffff;

 			/* Never report a mismatch as 0 */
 			return low16 ? low16 : 0x10000;
 		}
 	}

 	return 0;
 }

 /*
  * Calculates the L2 cache size and records it in BBL_CR_CTL3.
  *
  * Two strategies, selected by bit 5 of L2 control register 0:
  *  - bit clear: find the largest size setting BBL_CR_CTL3 accepts, then
  *    probe the cache with test_l2_address_alias() to find the address at
  *    which it wraps; the result is stored in BBL_CR_CTL3 and in L2 control
  *    register 2;
  *  - bit set: derive the size from fields of L2 control register 2.
  *
  * Reference: Intel(R) 64 and IA-32 Architectures Software Developers Manual
  *            Volume 3B: System Programming Guide, Part 2, Intel pub. 253669, pg. B-172.
  *
  * Returns 0 on success, -1 on an L2 controller error or when no usable
  * size can be determined.
  */
 int calculate_l2_cache_size(void)
 {
 	int v;
 	msr_t msr;
 	u32 cache_setting;
 	u32 address, size, eax, bblcr3;

 	v = read_l2(0);
 	if (v < 0)
 		return -1;
 	if ((v & 0x20) == 0) {
 		msr = rdmsr(BBL_CR_CTL3);
 		bblcr3 = msr.lo & ~BBLCR3_L2_SIZE;
 		/*
 		 * Successively write in all the possible cache size per bank
 		 * into BBL_CR_CTL3[17:13], starting from 256KB (00001) to 4MB (10000),
 		 * and read the last value written and accepted by the cache.
 		 *
 		 * No idea why these bits are writable at all.
 		 */
 		for (cache_setting = BBLCR3_L2_SIZE_256K;
 		     cache_setting <= BBLCR3_L2_SIZE_4M; cache_setting <<= 1) {

 			eax = bblcr3 | cache_setting;
 			msr.lo = eax;
 			wrmsr(BBL_CR_CTL3, msr);
 			msr = rdmsr(BBL_CR_CTL3);

 			/* Value not accepted */
 			if (msr.lo != eax)
 				break;
 		}

 		/* Backtrack to the last value that worked... */
 		cache_setting >>= 1;

 		/* and write it into BBL_CR_CTL3 */
 		msr.lo &= ~BBLCR3_L2_SIZE;
 		msr.lo |= (cache_setting & BBLCR3_L2_SIZE);

 		wrmsr(BBL_CR_CTL3, msr);

 		printk(BIOS_DEBUG,"Maximum cache mask is %x\n", cache_setting);

 		/* For now, BBL_CR_CTL3 has the highest cache "size" that register
 		 * will accept. Now we'll ping the cache and see where it wraps.
 		 */

 		/* Write aaaaaaaa:aaaaaaaa to address 0 in the l2 cache.
 		 * Any non-zero result (controller error or data mismatch) means
 		 * the cache cannot be written to properly, and we have a problem.
 		 */
 		v = test_l2_address_alias(0, 0, 0xaaaaaaaa, 0xaaaaaaaa);
 		if (v != 0)
 			return -1;

 		/* Start with 32K wrap point (256KB actually) */
 		size = 1;
 		address = 0x8000;

 		while (1) {
 			/* Write 55555555:55555555 at `address`, read back address 0 */
 			v = test_l2_address_alias(address, 0, 0x55555555,
 						  0x55555555);
 			// Write failed.
 			if (v < 0)
 				return -1;
 			// It wraps here.
 			else if (v == 0)
 				break;

 			size <<= 1;
 			address <<= 1;

 			/* Give up if no wrap point is found up to 0x200000 */
 			if (address > 0x200000)
 				return -1;
 		}

 		/* Mask size */
 		size &= 0x3e;

 		/* Shift to [17:13] */
 		size <<= 12;

 		/* Set this into BBL_CR_CTL3 */
 		msr = rdmsr(BBL_CR_CTL3);
 		msr.lo &= ~BBLCR3_L2_SIZE;
 		msr.lo |= size;
 		wrmsr(BBL_CR_CTL3, msr);

 		printk(BIOS_DEBUG,"L2 Cache Mask is %x\n", size);

 		/* Shift to [6:2] */
 		size >>= 11;

 		v = read_l2(2);

 		if (v < 0)
 			return -1;

 		printk(BIOS_DEBUG,"L2(2): %x ", v);

 		v &= 0x3;

 		/* Shift size right by v */
 		size >>= v;

 		/* Or in this size */
 		v |= size;

 		printk(BIOS_DEBUG,"-> %x\n", v);

 		if (write_l2(2, v) != 0)
 			return -1;
 	} else {
 		// Some cache size information is available from L2 registers.
 		// Work from there.
 		int b, c, shift;

 		v = read_l2(2);

 		printk(BIOS_DEBUG,"L2(2) = %x\n", v);

 		if (v < 0)
 			return -1;

 		// L2 register 2 bitmap: cc---bbb
 		b = v & 0x7;
 		c = v >> 6;

 		/* The original expression was "1 << c * b", i.e. 1 << (c * b),
 		 * followed by "& 0xf". Only shift counts 0..3 survive that mask;
 		 * larger counts give 0, and counts of 31 or more are undefined
 		 * behaviour for an int shift, so compute the same result without
 		 * performing an out-of-range shift.
 		 * NOTE(review): the intended formula may have been (1 << c) * b -
 		 * confirm against hardware documentation.
 		 */
 		shift = c * b;
 		v = (shift < 4) ? (1 << shift) : 0;

 		v &= 0xf;

 		printk(BIOS_DEBUG,"Calculated a = %x\n", v);

 		if (v == 0)
 			return -1;

 		/* Shift to 17:14 */
 		v <<= 14;

 		/* Write this size into BBL_CR_CTL3 */
 		msr = rdmsr(BBL_CR_CTL3);
 		msr.lo &= ~BBLCR3_L2_SIZE;
 		msr.lo |= v;
 		wrmsr(BBL_CR_CTL3, msr);
 	}

 	return 0;
 }

 /*
  * Program the L2 physical address range field of BBL_CR_CTL3 (bits [22:20]).
  *
  * The 3-bit code comes from L2 control register 3, bits [2:0], unless bit 5
  * of L2 control register 0 is set, in which case the code is forced to 7.
  *
  * Returns 0 on success, -1 if either L2 register cannot be read.
  */
 int calculate_l2_physical_address_range(void)
 {
 	msr_t ctl3;
 	int range_code, reg0;

 	range_code = read_l2(3);
 	if (range_code < 0)
 		return -1;

 	reg0 = read_l2(0);
 	if (reg0 < 0)
 		return -1;

 	range_code = (reg0 & 0x20) ? 0x7 : (range_code & 0x7);

 	printk(BIOS_DEBUG,"L2 Physical Address Range is %dM\n", 512 << range_code);

 	/* Replace bits [22:20] of BBL_CR_CTL3 with the code */
 	ctl3 = rdmsr(BBL_CR_CTL3);
 	ctl3.lo = (ctl3.lo & ~BBLCR3_L2_PHYSICAL_RANGE) | (range_code << 20);
 	wrmsr(BBL_CR_CTL3, ctl3);

 	return 0;
 }

 /*
  * Detect whether the L2 cache keeps ECC bits and, if so, set ECC Check
  * Enable in BBL_CR_CTL3.
  *
  * User Supplied ECC mode is switched on in BBL_CR_CTL, a pattern is written
  * to BBL_CR_DECC, a cache line is written and read back, and BBL_CR_DECC is
  * compared with the pattern. User Supplied ECC mode is switched off again on
  * every path, including the error path (it used to stay enabled when the
  * cache access failed).
  *
  * Returns 0 when the probe ran (whether or not ECC was enabled), -1 if the
  * cache access failed.
  */
 int set_l2_ecc(void)
 {
 	const u32 ecc_pattern = 0xaa55aa55;
 	const u32 test_data = 0xaaaaaaaa;
 	msr_t msr;
 	int result = 0;

 	/* Set User Supplied ECC in BBL_CR_CTL */
 	msr = rdmsr(BBL_CR_CTL);
 	msr.lo |= BBLCR3_L2_SUPPLIED_ECC;
 	wrmsr(BBL_CR_CTL, msr);

 	/* Write a value into the L2 Data ECC register BBL_CR_DECC */
 	msr.lo = ecc_pattern;
 	msr.hi = 0;
 	wrmsr(BBL_CR_DECC, msr);

 	if (test_l2_address_alias(0, 0, test_data, test_data) < 0) {
 		result = -1;
 	} else {
 		/* Read back ECC from BBL_CR_DECC */
 		msr = rdmsr(BBL_CR_DECC);

 		if (msr.lo == ecc_pattern) {
 			printk(BIOS_DEBUG,"L2 ECC Checking is enabled\n");

 			/* Set ECC Check Enable in BBL_CR_CTL3 */
 			msr = rdmsr(BBL_CR_CTL3);
 			msr.lo |= BBLCR3_L2_ECC_CHECK_ENABLE;
 			wrmsr(BBL_CR_CTL3, msr);
 		}
 	}

 	/* Clear User Supplied ECC in BBL_CR_CTL, success or failure */
 	msr = rdmsr(BBL_CR_CTL);
 	msr.lo &= ~BBLCR3_L2_SUPPLIED_ECC;
 	wrmsr(BBL_CR_CTL, msr);

 	return result;
 }

 /*
  * This is the function called from CPU initialization
  * driver to set up P6 family L2 cache.
  */

 int p6_configure_l2_cache(void)
 {
 	msr_t msr, bblctl3;
 	unsigned int eax;
 	u16 signature;
 	int cache_size, bank;
 	int result, calc_eax;
 	int v, a;

 	int badclk1, badclk2, clkratio;
 	int crctl3_or;

 	printk(BIOS_INFO, "Configuring L2 cache... ");

 	/* Read BBL_CR_CTL3 */
 	bblctl3 = rdmsr(BBL_CR_CTL3);
 	/* If bit 23 (L2 Hardware disable) is set then done */
 	/* These would be Covington core Celerons with no L2 cache */
 	if (bblctl3.lo & BBLCR3_L2_NOT_PRESENT) {
 		printk(BIOS_INFO,"hardware disabled\n");
 		return 0;
 	}

 	signature = cpuid_eax(1) & 0xfff0;

 	/* Klamath-specific bit settings for certain
 	   preliminary checks.
 	 */
 	if (signature == 0x630) {
 		clkratio = 0x1c00000;
 		badclk2 = 0x1000000;
 		crctl3_or = 0x44000;
 	} else {
 		clkratio = 0x3c00000;
 		badclk2 = 0x3000000;
 		crctl3_or = 0x40000;
 	}
 	badclk1 = 0xc00000;

 	/* Read EBL_CR_POWERON */
 	msr = rdmsr(EBL_CR_POWERON);
 	eax = msr.lo;
 	/* Mask out [22-25] Clock frequency ratio */
 	eax &= clkratio;
 	if (eax == badclk1 || eax == badclk2) {
 		printk(BIOS_ERR, "Incorrect clock frequency ratio %x\n", eax);
 		return -1;
 	}

 	disable_cache();

 	/* Mask out from BBL_CR_CTL3:
 	 * [0] L2 Configured
 	 * [5] ECC Check Enable
 	 * [6] Address Parity Check Enable
 	 * [7] CRTN Parity Check Enable
 	 * [8] L2 Enabled
 	 * [12:11] Number of L2 banks
 	 * [17:13] Cache size per bank
 	 * [18] (Set below)
 	 * [22:20] L2 Physical Address Range Support
 	 */
 	bblctl3.lo &= 0xff88061e;
 	/* Set:
 	 * [17:13] = 00010 = 512Kbyte Cache size per bank (63x)
 	 * [17:13] = 00000 = 128Kbyte Cache size per bank (all others)
 	 * [18] Cache state error checking enable
 	 */
 	bblctl3.lo |= crctl3_or;

 	/* Write BBL_CR_CTL3 */
 	wrmsr(BBL_CR_CTL3, bblctl3);

 	if (signature != 0x630) {
 		eax = bblctl3.lo;

 		/* Set the l2 latency in BBL_CR_CTL3 */
 		if (calculate_l2_latency() != 0)
 			goto bad;

 		/* Read the new latency values back */
 		bblctl3 = rdmsr(BBL_CR_CTL3);
 		calc_eax = bblctl3.lo;

 		/* Write back the original default value */
 		bblctl3.lo = eax;
 		wrmsr(BBL_CR_CTL3, bblctl3);

 		/* Write BBL_CR_CTL3[27:26] (reserved??) to bits [1:0] of L2 register 4.
 		 * Apparently all other bits must be preserved, hence these code.
 		 */

 		v = (calc_eax >> 26) & 0x3;

 		printk(BIOS_DEBUG,"write_l2(4, %x)\n", v);

 		a = read_l2(4);
 		if (a >= 0)
 		{
 			a &= 0xfffc;
 			a |= v;
 			a = write_l2(4, a);
 			/* a now contains result code from write_l2() */
 		}
 		if (a != 0)
 			goto bad;

 		/* Restore the correct latency value into BBL_CR_CTL3 */
 		bblctl3.lo = calc_eax;
 		wrmsr(BBL_CR_CTL3, bblctl3);
 	} /* ! 63x CPU */

 	/* Read L2 register 0 */
 	v = read_l2(0);

 	/* If L2(0)[5] set (and can be read properly), enable CRTN and address parity
 	 */
 	if (v >= 0 && (v & 0x20)) {
 		bblctl3 = rdmsr(BBL_CR_CTL3);
 		bblctl3.lo |= (BBLCR3_L2_ADDR_PARITY_ENABLE |
 		               BBLCR3_L2_CRTN_PARITY_ENABLE);
 		wrmsr(BBL_CR_CTL3, bblctl3);
 	}

 	/* If something goes wrong at L2 ECC setup, cache ECC
 	 * will just remain disabled.
 	 */
 	set_l2_ecc();

 	if (calculate_l2_physical_address_range() != 0) {
 		printk(BIOS_ERR, "Failed to calculate L2 physical address range");
 		goto bad;
 	}

 	if (calculate_l2_cache_size() != 0) {
 		printk(BIOS_ERR, "Failed to calculate L2 cache size");
 		goto bad;
 	}

 	/* Turn on cache. Only L1 is active at this time. */
 	enable_cache();

 	/* Get the calculated cache size from BBL_CR_CTL3[17:13] */
 	bblctl3 = rdmsr(BBL_CR_CTL3);
 	cache_size = (bblctl3.lo & BBLCR3_L2_SIZE);
 	if (cache_size == 0)
 		cache_size = 0x1000;
 	cache_size = cache_size << 3;

 	/* TODO: Cache size above is per bank. We're supposed to get
 	 * the number of banks from BBL_CR_CTL3[12:11].
 	 * Confirm that this still provides the correct answer.
 	 */
 	bank = (bblctl3.lo >> 11) & 0x3;
 	if (bank == 0)
 		bank = 1;

 	printk(BIOS_INFO, "size %dK... ", cache_size * bank * 4 / 1024);

 	/* Write to all cache lines to initialize */

 	while (cache_size > 0) {

 		/* Each cache line is 32 bytes. */
 		cache_size -= 32;

 		/* Update each way */

 		/* We're supposed to get L2 associativity from BBL_CR_CTL3[10:9].
 		 * But this code only applies to certain members of the P6 processor family
 		 * and since all P6 processors have 4-way L2 cache, we can safely assume
 		 * 4 way for all cache operations.
 		 */

 		for (v = 0; v < 4; v++) {
 			/* Send Tag Write w/Data Write (TWW) to L2 controller
 			 * MESI = Invalid
 			 */
 			if (signal_l2(cache_size, 0, 0, v, L2CMD_TWW | L2CMD_MESI_I) != 0) {
 				printk(BIOS_ERR, "Failed on signal_l2(%x, %x)\n",
 				       cache_size, v);
 				goto bad;
 			}
 		}
 	}
 	printk(BIOS_DEBUG, "L2 Cache lines initialized\n");

 	/* Disable cache */
 	disable_cache();

 	/* Set L2 cache configured in BBL_CR_CTL3 */
 	bblctl3 = rdmsr(BBL_CR_CTL3);
 	bblctl3.lo |= BBLCR3_L2_CONFIGURED;
 	wrmsr(BBL_CR_CTL3, bblctl3);

 	/* Invalidate cache and discard unsaved writes */
 	asm volatile ("invd");

 	/* Write 0 to L2 control register 5 */
 	if (write_l2(5, 0) != 0) {
 		printk(BIOS_ERR,"write_l2(5, 0) failed\n");
 		goto done;
 	}

 	bblctl3 = rdmsr(BBL_CR_CTL3);
 	if (signature == 0x650) {
 		/* Change the L2 latency to 0101 then back to
 		 * original value. I don't know why this is needed - dpd
 		 */
 		eax = bblctl3.lo;
 		bblctl3.lo &= ~BBLCR3_L2_LATENCY;
 		bblctl3.lo |= 0x0a;
 		wrmsr(BBL_CR_CTL3, bblctl3);
 		bblctl3.lo = eax;
 		wrmsr(BBL_CR_CTL3, bblctl3);
 	}

 	/* Enable L2 in BBL_CR_CTL3 */
 	bblctl3.lo |= BBLCR3_L2_ENABLED;
 	wrmsr(BBL_CR_CTL3, bblctl3);

 	/* Turn on cache. Both L1 and L2 are now active. Wahoo! */
 done:
 	result = 0;
 	goto out;
 bad:
 	result = -1;
 out:
 	printk(BIOS_INFO, "done.\n");
 	return result;
 }
	/*
	* This file is part of the coreboot project.
	*
	* Copyright (C) 2000 Denis Dowling <dpd@alphalink.com.au>
	* Copyright (C) 2010 Keith Hui <buurin@gmail.com>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	/*
	* Intel Pentium L2 Cache initialization.
	* This code was developed by reverse engineering
	* the BIOS. Where the code accesses documented
	* registers I have added comments as best I can.
	* Some undocumented registers on the Pentium II are
	* used so some of the documentation is incomplete.
	*
	* References:
	* Intel Architecture Software Developer's Manual
	* Volume 3B: System Programming Guide, Part 2 (#253669)
	* Appendix B.9
	*/

	/* This code is ported from coreboot v1.
	* The L2 cache initialization sequence here only applies to SECC/SECC2 P6 family
	* CPUs with Klamath (63x), Deschutes (65x) and Katmai (67x) cores.
	* It is not required for Coppermine (68x) and Tualatin (6bx) cores.
	* It is currently not known if Celerons with Mendocino (66x) core require the
	* special initialization.
	* Covington-core Celerons do not have L2 cache.
	*/

	#include <stdint.h>
	#include <console/console.h>
	#include <string.h>
	#include <cpu/cpu.h>
	#include <cpu/x86/mtrr.h>
	#include <cpu/intel/l2_cache.h>
	#include <cpu/x86/cache.h>
	#include <cpu/x86/msr.h>

	/* Latency Tables */
	/* One {key, value} row of an L2 latency lookup table. */
	struct latency_entry {
	/* Lookup key, compared against the masked EBL_CR_POWERON bits;
	 * 0xFF marks the end of a table. */
	u8 key;
	/* Latency value that calculate_l2_latency() ORs into BBL_CR_CTL3. */
	u8 value;
	};
	/*
	Latency maps for Deschutes and Katmai.
	No such mapping is available for Klamath.

	Cache latency to
	be written to L2 -----++++
	control register      ||||
	0000 xx 00 -----> 000 cccc 0
	||||    00 66MHz
	||||    10 100MHz
	||||    01 133MHz (Katmai "B" only)
	++++------ CPU frequency multiplier

	0000 2x
	0001 3x
	0010 4x
	0011 5x
	0100 2.5x
	0101 3.5x
	0110 4.5x
	0111 5.5x
	1000 6x
	1001 7x
	1010 8x
	1011 Reserved
	1100 6.5x
	1101 7.5x
	1110 1.5x
	1111 2x

	*/
	/*
	 * Latency lookup tables. Each table is a list of {key, value} pairs that
	 * ends with a {0xFF, 0x00} sentinel; calculate_l2_latency() walks a table
	 * from the start until it finds a matching key or reaches the sentinel.
	 * The "_tN" suffix is the latency type derived from L2 control register 0.
	 */

	/* Model 65x, latency type 0 */
	static const struct latency_entry latency_650_t0[] = {
	{0x10, 0x02}, {0x50, 0x02}, {0x20, 0x04}, {0x60, 0x06},
	{0x00, 0x08}, {0x40, 0x0C}, {0x12, 0x06}, {0x52, 0x0A},
	{0x22, 0x0E}, {0x62, 0x10}, {0x02, 0x10}, {0xFF, 0x00}
	};

	/* Model 65x, latency type 1 */
	static const struct latency_entry latency_650_t1[] = {
	{0x12, 0x14}, {0x52, 0x16}, {0x22, 0x16}, {0x62, 0x16},
	{0xFF, 0x00}
	};

	/* Model 67x, latency type 0 */
	static const struct latency_entry latency_670_t0[] = {
	{0x60, 0x06}, {0x00, 0x08}, {0x12, 0x06}, {0x52, 0x0A},
	{0x22, 0x0E}, {0x62, 0x10}, {0x02, 0x10}, {0x42, 0x02},
	{0x11, 0x0E}, {0x51, 0x0C}, {0x21, 0x02}, {0x61, 0x10},
	{0x01, 0x10}, {0x41, 0x02}, {0xFF, 0x00}
	};

	/* Model 67x, latency type 1 */
	static const struct latency_entry latency_670_t1[] = {
	{0x22, 0x18}, {0x62, 0x18}, {0x02, 0x1A}, {0x11, 0x18},
	{0xFF, 0x00}
	};

	/* Model 67x, latency type 2 */
	static const struct latency_entry latency_670_t2[] = {
	{0x22, 0x12}, {0x62, 0x14}, {0x02, 0x16}, {0x42, 0x1E},
	{0x11, 0x12}, {0x51, 0x16}, {0x21, 0x1E}, {0x61, 0x14},
	{0x01, 0x16}, {0x41, 0x1E}, {0xFF, 0x00}
	};

	/* Latency tables for 650 model/type, indexed by latency type 0..2.
	 * NOTE(review): type 2 reuses the type 1 table (there is no
	 * latency_650_t2 in this file) - confirm this is intentional.
	 */
	static const struct latency_entry *latency_650[] = {
	latency_650_t0, latency_650_t1, latency_650_t1
	};

	/* Latency tables for 670 model/type, indexed by latency type 0..2 */
	static const struct latency_entry *latency_670[] = {
	latency_670_t0, latency_670_t1, latency_670_t2
	};

	int calculate_l2_latency(void)
	{
	u32 eax, l, signature;
	const struct latency_entry latency_table, le;
	msr_t msr;

	/* First, attempt to get cache latency value from
	IA32_PLATFORM_ID[56:53]. (L2 Cache Latency Read)
	*/
	msr = rdmsr(IA32_PLATFORM_ID);

	printk(BIOS_DEBUG,"rdmsr(IA32_PLATFORM_ID) = %x:%x\n", msr.hi, msr.lo);

	l = (msr.hi >> 20) & 0x1e;

	if (l == 0) {
	/* If latency value isn't available from
	IA32_PLATFORM_ID[56:53], read it from
	L2 control register 0 for lookup from
	tables. */
	int t, a;

	/* The raw code is read from L2 register 0, bits [7:4]. */
	a = read_l2(0);
	if (a < 0)
	return -1;

	a &= 0xf0;

	if ((a & 0x20) == 0)
	t = 0;
	else if (a == 0x20)
	t = 1;
	else if (a == 0x30)
	t = 2;
	else
	return -1;

	printk(BIOS_DEBUG,"L2 latency type = %x\n", t);

	/* Get CPUID family/model */
	signature = cpuid_eax(1) & 0xfff0;

	/* Read EBL_CR_POWERON */
	msr = rdmsr(EBL_CR_POWERON);
	/* Get clock multiplier and FSB frequency.
	* Multiplier is in [25:22].
	* FSB is in [19:18] in Katmai, [19] in Deschutes ([18] is zero for them).
	*/
	eax = msr.lo >> 18;
	if (signature == 0x650) {
	eax &= ~0xf2;
	latency_table = latency_650[t];
	} else if (signature == 0x670) {
	eax &= ~0xf3;
	latency_table = latency_670[t];
	} else
	return -1;

	/* Search table for matching entry */
	for (le = latency_table; le->key != eax; le++) {
	/* Fail if we get to the end of the table */
	if (le->key == 0xff) {
	printk(BIOS_DEBUG, "Could not find key %02x in latency table\n", eax);
	return -1;
	}
	}

	l = le->value;
	}

	printk(BIOS_DEBUG,"L2 Cache latency is %d\n", l / 2);

	/* Writes the calculated latency in BBL_CR_CTL3[4:1]. */
	msr = rdmsr(BBL_CR_CTL3);
	msr.lo &= 0xffffffe1;
	msr.lo \|= l;
	wrmsr(BBL_CR_CTL3, msr);

	return 0;
	}


	/*
	 * Issue one command to the L2 cache controller.
	 *
	 * Loads `address` into BBL_CR_ADDR and data_high:data_low into each of
	 * BBL_CR_D0..BBL_CR_D3, places `command` and `way` into BBL_CR_CTL, writes
	 * BBL_CR_TRIG to start the operation and then polls BBL_CR_BUSY.
	 *
	 * Returns 0 once the busy bit reads clear, -1 if it is still set after
	 * 0x100 polls.
	 */
	int signal_l2(u32 address, u32 data_high, u32 data_low, int way, u8 command)
	{
		int i;
		msr_t msr;

		/* Write L2 Address to BBL_CR_ADDR */
		msr.lo = address;
		msr.hi = 0;
		wrmsr(BBL_CR_ADDR, msr);

		/* Write data to BBL_CR_D{0..3} */
		msr.lo = data_low;
		msr.hi = data_high;
		for (i = BBL_CR_D0; i <= BBL_CR_D3; i++) {
			wrmsr(i, msr);
		}

		/* Put the command and way into BBL_CR_CTL: bits [4:0] and [9:8]
		 * are cleared first, all other bits are preserved. */
		msr = rdmsr(BBL_CR_CTL);
		msr.lo = (msr.lo & 0xfffffce0) | command | (way << 8);
		wrmsr(BBL_CR_CTL, msr);

		/* Trigger L2 controller */
		msr.lo = 0;
		msr.hi = 0;
		wrmsr(BBL_CR_TRIG, msr);

		/* Poll the controller to see when done */
		for (i = 0; i < 0x100; i++) {
			/* Read BBL_CR_BUSY */
			msr = rdmsr(BBL_CR_BUSY);
			/* If not busy then return */
			if ((msr.lo & 1) == 0)
				return 0;
		}

		/* Return timeout code */
		return -1;
	}

	/*
	 * Fetch the contents of the L2 cache controller register `address`.
	 *
	 * Returns the register value (BBL_CR_ADDR shifted right by 21 bits) or
	 * -1 if the controller command fails.
	 */
	int read_l2(u32 address)
	{
		msr_t result;
		int status;

		/* L2 Control Register Read; the register number goes in at bit 5 */
		status = signal_l2(address << 5, 0, 0, 0, L2CMD_CR);
		if (status)
			return -1;

		/* The controller hands the value back through BBL_CR_ADDR */
		result = rdmsr(BBL_CR_ADDR);
		return result.lo >> 21;
	}

	/*
	 * Store the low 8 bits of `data` in L2 controller register `address`.
	 *
	 * The write command is issued several times; the repeat count comes from
	 * L2 register 2 and depends on bit 5 of L2 register 0. Why the write must
	 * be replicated is not known.
	 *
	 * Returns 0 on success, -1 if reading registers 0/2 or any write fails.
	 */
	int write_l2(u32 address, u32 data)
	{
		int v1, v2, i;

		v1 = read_l2(0);
		if (v1 < 0)
			return -1;

		v2 = read_l2(2);
		if (v2 < 0)
			return -1;

		/* Repeat count: reg2[2:0] when reg0 bit 5 is set, else reg2[1:0] + 1 */
		if ((v1 & 0x20) == 0) {
			v2 &= 0x3;
			v2++;
		} else
			v2 &= 0x7;

		/* This write has to be replicated to a number of places. Not sure what. */

		for (i = 0; i < v2; i++) {

			u32 data1, data2;
			// Bits legend
			// data1   = ffffffff
			// data2   = 000000dc
			// address = 00aaaaaa
			// Final address signalled:
			// 000fffff fff000c0 000dcaaa aaa00000
			data1 = data & 0xff;
			data1 = data1 << 21;
			data2 = (i << 11) & 0x1800;
			data1 |= data2;
			data2 <<= 6;
			data2 &= 0x20000;
			data1 |= data2;

			/* Signal L2 controller (command 3, no data, way 0) */
			if (signal_l2((address << 5) | data1, 0, 0, 0, 3))
				return -1;
		}
		return 0;
	}

	/*
	 * Write data_high:data_low into the cache at address1, then read back
	 * from address2 and compare all of BBL_CR_D0..BBL_CR_D3 with that data.
	 *
	 * Returns:
	 *    0  the data read at address2 matches (the two addresses alias);
	 *   -1  an L2 controller command failed;
	 *   >0  mismatch: the low 16 bits of the first mismatching low word, or
	 *       0x10000 when those 16 bits are all zero. Previously such a
	 *       mismatch returned 0 and was indistinguishable from a match.
	 */
	int test_l2_address_alias(u32 address1, u32 address2,
					 u32 data_high, u32 data_low)
	{
		int d;
		msr_t msr;

		/* Tag Write with Data Write for L2 */
		if (signal_l2(address1, data_high, data_low, 0, L2CMD_TWW))
			return -1;

		/* Tag Read with Data Read for L2 */
		if (signal_l2(address2, 0, 0, 0, L2CMD_TRR))
			return -1;

		/* Read data from BBL_CR_D[0-3] */
		for (d = BBL_CR_D0; d <= BBL_CR_D3; d++) {
			msr = rdmsr(d);
			if (msr.lo != data_low || msr.hi != data_high) {
				int low16 = msr.lo & 0xffff;

				/* Never report a mismatch as 0 */
				return low16 ? low16 : 0x10000;
			}
		}

		return 0;
	}

	/*
	 * Calculates the L2 cache size and records it in BBL_CR_CTL3.
	 *
	 * Two strategies, selected by bit 5 of L2 control register 0:
	 *  - bit clear: find the largest size setting BBL_CR_CTL3 accepts, then
	 *    probe the cache with test_l2_address_alias() to find the address at
	 *    which it wraps; the result is stored in BBL_CR_CTL3 and in L2 control
	 *    register 2;
	 *  - bit set: derive the size from fields of L2 control register 2.
	 *
	 * Reference: Intel(R) 64 and IA-32 Architectures Software Developers Manual
	 *            Volume 3B: System Programming Guide, Part 2, Intel pub. 253669, pg. B-172.
	 *
	 * Returns 0 on success, -1 on an L2 controller error or when no usable
	 * size can be determined.
	 */
	int calculate_l2_cache_size(void)
	{
		int v;
		msr_t msr;
		u32 cache_setting;
		u32 address, size, eax, bblcr3;

		v = read_l2(0);
		if (v < 0)
			return -1;
		if ((v & 0x20) == 0) {
			msr = rdmsr(BBL_CR_CTL3);
			bblcr3 = msr.lo & ~BBLCR3_L2_SIZE;
			/*
			 * Successively write in all the possible cache size per bank
			 * into BBL_CR_CTL3[17:13], starting from 256KB (00001) to 4MB (10000),
			 * and read the last value written and accepted by the cache.
			 *
			 * No idea why these bits are writable at all.
			 */
			for (cache_setting = BBLCR3_L2_SIZE_256K;
			     cache_setting <= BBLCR3_L2_SIZE_4M; cache_setting <<= 1) {

				eax = bblcr3 | cache_setting;
				msr.lo = eax;
				wrmsr(BBL_CR_CTL3, msr);
				msr = rdmsr(BBL_CR_CTL3);

				/* Value not accepted */
				if (msr.lo != eax)
					break;
			}

			/* Backtrack to the last value that worked... */
			cache_setting >>= 1;

			/* and write it into BBL_CR_CTL3 */
			msr.lo &= ~BBLCR3_L2_SIZE;
			msr.lo |= (cache_setting & BBLCR3_L2_SIZE);

			wrmsr(BBL_CR_CTL3, msr);

			printk(BIOS_DEBUG,"Maximum cache mask is %x\n", cache_setting);

			/* For now, BBL_CR_CTL3 has the highest cache "size" that register
			 * will accept. Now we'll ping the cache and see where it wraps.
			 */

			/* Write aaaaaaaa:aaaaaaaa to address 0 in the l2 cache.
			 * Any non-zero result (controller error or data mismatch) means
			 * the cache cannot be written to properly, and we have a problem.
			 */
			v = test_l2_address_alias(0, 0, 0xaaaaaaaa, 0xaaaaaaaa);
			if (v != 0)
				return -1;

			/* Start with 32K wrap point (256KB actually) */
			size = 1;
			address = 0x8000;

			while (1) {
				/* Write 55555555:55555555 at `address`, read back address 0 */
				v = test_l2_address_alias(address, 0, 0x55555555,
							  0x55555555);
				// Write failed.
				if (v < 0)
					return -1;
				// It wraps here.
				else if (v == 0)
					break;

				size <<= 1;
				address <<= 1;

				/* Give up if no wrap point is found up to 0x200000 */
				if (address > 0x200000)
					return -1;
			}

			/* Mask size */
			size &= 0x3e;

			/* Shift to [17:13] */
			size <<= 12;

			/* Set this into BBL_CR_CTL3 */
			msr = rdmsr(BBL_CR_CTL3);
			msr.lo &= ~BBLCR3_L2_SIZE;
			msr.lo |= size;
			wrmsr(BBL_CR_CTL3, msr);

			printk(BIOS_DEBUG,"L2 Cache Mask is %x\n", size);

			/* Shift to [6:2] */
			size >>= 11;

			v = read_l2(2);

			if (v < 0)
				return -1;

			printk(BIOS_DEBUG,"L2(2): %x ", v);

			v &= 0x3;

			/* Shift size right by v */
			size >>= v;

			/* Or in this size */
			v |= size;

			printk(BIOS_DEBUG,"-> %x\n", v);

			if (write_l2(2, v) != 0)
				return -1;
		} else {
			// Some cache size information is available from L2 registers.
			// Work from there.
			int b, c, shift;

			v = read_l2(2);

			printk(BIOS_DEBUG,"L2(2) = %x\n", v);

			if (v < 0)
				return -1;

			// L2 register 2 bitmap: cc---bbb
			b = v & 0x7;
			c = v >> 6;

			/* The original expression was "1 << c * b", i.e. 1 << (c * b),
			 * followed by "& 0xf". Only shift counts 0..3 survive that mask;
			 * larger counts give 0, and counts of 31 or more are undefined
			 * behaviour for an int shift, so compute the same result without
			 * performing an out-of-range shift.
			 * NOTE(review): the intended formula may have been (1 << c) * b -
			 * confirm against hardware documentation.
			 */
			shift = c * b;
			v = (shift < 4) ? (1 << shift) : 0;

			v &= 0xf;

			printk(BIOS_DEBUG,"Calculated a = %x\n", v);

			if (v == 0)
				return -1;

			/* Shift to 17:14 */
			v <<= 14;

			/* Write this size into BBL_CR_CTL3 */
			msr = rdmsr(BBL_CR_CTL3);
			msr.lo &= ~BBLCR3_L2_SIZE;
			msr.lo |= v;
			wrmsr(BBL_CR_CTL3, msr);
		}

		return 0;
	}

	/*
	 * Program the L2 physical address range field of BBL_CR_CTL3 (bits [22:20]).
	 *
	 * The 3-bit code comes from L2 control register 3, bits [2:0], unless bit 5
	 * of L2 control register 0 is set, in which case the code is forced to 7.
	 *
	 * Returns 0 on success, -1 if either L2 register cannot be read.
	 */
	int calculate_l2_physical_address_range(void)
	{
		int r0, r3;
		msr_t msr;

		r3 = read_l2(3);
		if (r3 < 0)
			return -1;

		r0 = read_l2(0);
		if (r0 < 0)
			return -1;

		if (r0 & 0x20)
			r3 = 0x7;
		else
			r3 &= 0x7;

		printk(BIOS_DEBUG,"L2 Physical Address Range is %dM\n", (1 << r3) * 512);

		/* Shift into [22:20] to be saved into BBL_CR_CTL3. */
		r3 = r3 << 20;

		msr = rdmsr(BBL_CR_CTL3);
		msr.lo &= ~BBLCR3_L2_PHYSICAL_RANGE;
		msr.lo |= r3;
		wrmsr(BBL_CR_CTL3, msr);

		return 0;
	}

	/*
	 * Detect whether the L2 cache keeps ECC bits and, if so, set ECC Check
	 * Enable in BBL_CR_CTL3.
	 *
	 * User Supplied ECC mode is switched on in BBL_CR_CTL, a pattern is written
	 * to BBL_CR_DECC, a cache line is written and read back, and BBL_CR_DECC is
	 * compared with the pattern. User Supplied ECC mode is switched off again on
	 * every path, including the error path (it used to stay enabled when the
	 * cache access failed).
	 *
	 * Returns 0 when the probe ran (whether or not ECC was enabled), -1 if the
	 * cache access failed.
	 */
	int set_l2_ecc(void)
	{
		const u32 ecc_pattern = 0xaa55aa55;
		const u32 test_data = 0xaaaaaaaa;
		msr_t msr;
		int result = 0;

		/* Set User Supplied ECC in BBL_CR_CTL */
		msr = rdmsr(BBL_CR_CTL);
		msr.lo |= BBLCR3_L2_SUPPLIED_ECC;
		wrmsr(BBL_CR_CTL, msr);

		/* Write a value into the L2 Data ECC register BBL_CR_DECC */
		msr.lo = ecc_pattern;
		msr.hi = 0;
		wrmsr(BBL_CR_DECC, msr);

		if (test_l2_address_alias(0, 0, test_data, test_data) < 0) {
			result = -1;
		} else {
			/* Read back ECC from BBL_CR_DECC */
			msr = rdmsr(BBL_CR_DECC);

			if (msr.lo == ecc_pattern) {
				printk(BIOS_DEBUG,"L2 ECC Checking is enabled\n");

				/* Set ECC Check Enable in BBL_CR_CTL3 */
				msr = rdmsr(BBL_CR_CTL3);
				msr.lo |= BBLCR3_L2_ECC_CHECK_ENABLE;
				wrmsr(BBL_CR_CTL3, msr);
			}
		}

		/* Clear User Supplied ECC in BBL_CR_CTL, success or failure */
		msr = rdmsr(BBL_CR_CTL);
		msr.lo &= ~BBLCR3_L2_SUPPLIED_ECC;
		wrmsr(BBL_CR_CTL, msr);

		return result;
	}

	/*
	 * This is the function called from CPU initialization
	 * driver to set up P6 family L2 cache.
	 *
	 * Outline of what the code below does:
	 *  1. Returns 0 immediately if BBL_CR_CTL3 reports that no L2 cache
	 *     is present.
	 *  2. Returns -1 if the clock frequency ratio field of EBL_CR_POWERON
	 *     holds one of two rejected values.
	 *  3. With caching disabled, resets the L2 fields of BBL_CR_CTL3 and
	 *     programs latency (all but 63x), parity checking, ECC, physical
	 *     address range and cache size.
	 *  4. With caching enabled, writes every 32-byte line of all four
	 *     ways with MESI state Invalid.
	 *  5. With caching disabled again, marks the L2 as configured,
	 *     executes invd, writes 0 to L2 register 5 and sets the L2
	 *     enable bit.
	 *
	 * Returns 0 on success and -1 on failure. A failed write_l2(5, 0)
	 * is logged but still returns 0.
	 *
	 * This function does not call enable_cache() after the final
	 * disable_cache(). NOTE(review): presumably the caller re-enables
	 * caching afterwards - confirm.
	 */

	int p6_configure_l2_cache(void)
	{
		msr_t msr, bblctl3;
		unsigned int eax;
		u16 signature;
		int cache_size, bank;
		int result, calc_eax;
		int v, a;

		int badclk1, badclk2, clkratio;
		int crctl3_or;

		printk(BIOS_INFO, "Configuring L2 cache... ");

		/* Read BBL_CR_CTL3 */
		bblctl3 = rdmsr(BBL_CR_CTL3);
		/* If bit 23 (L2 Hardware disable) is set then done */
		/* These would be Covington core Celerons with no L2 cache */
		if (bblctl3.lo & BBLCR3_L2_NOT_PRESENT) {
			printk(BIOS_INFO, "hardware disabled\n");
			return 0;
		}

		/* Keep only bits [15:4] of the CPUID signature; stepping is dropped. */
		signature = cpuid_eax(1) & 0xfff0;

		/* Klamath-specific bit settings for certain
		 * preliminary checks.
		 */
		if (signature == 0x630) {
			clkratio = 0x1c00000;
			badclk2 = 0x1000000;
			crctl3_or = 0x44000;
		} else {
			clkratio = 0x3c00000;
			badclk2 = 0x3000000;
			crctl3_or = 0x40000;
		}
		badclk1 = 0xc00000;

		/* Read EBL_CR_POWERON */
		msr = rdmsr(EBL_CR_POWERON);
		eax = msr.lo;
		/* Keep only the clock frequency ratio field
		 * ([24:22] on 63x, [25:22] otherwise).
		 */
		eax &= clkratio;
		if (eax == badclk1 || eax == badclk2) {
			printk(BIOS_ERR, "Incorrect clock frequency ratio %x\n", eax);
			return -1;
		}

		disable_cache();

		/* Mask out from BBL_CR_CTL3:
		 * [0] L2 Configured
		 * [5] ECC Check Enable
		 * [6] Address Parity Check Enable
		 * [7] CRTN Parity Check Enable
		 * [8] L2 Enabled
		 * [12:11] Number of L2 banks
		 * [17:13] Cache size per bank
		 * [18] (Set below)
		 * [22:20] L2 Physical Address Range Support
		 */
		bblctl3.lo &= 0xff88061e;
		/* Set:
		 * [17:13] = 00010 = 512Kbyte Cache size per bank (63x)
		 * [17:13] = 00000 = 128Kbyte Cache size per bank (all others)
		 * [18] Cache state error checking enable
		 */
		bblctl3.lo |= crctl3_or;

		/* Write BBL_CR_CTL3 */
		wrmsr(BBL_CR_CTL3, bblctl3);

		if (signature != 0x630) {
			/* Remember the default value just written. */
			eax = bblctl3.lo;

			/* Set the l2 latency in BBL_CR_CTL3 */
			if (calculate_l2_latency() != 0)
				goto bad;

			/* Read the new latency values back */
			bblctl3 = rdmsr(BBL_CR_CTL3);
			calc_eax = bblctl3.lo;

			/* Write back the original default value */
			bblctl3.lo = eax;
			wrmsr(BBL_CR_CTL3, bblctl3);

			/* Write BBL_CR_CTL3[27:26] (reserved??) to bits [1:0] of
			 * L2 register 4. Apparently all other bits must be
			 * preserved, hence the read-modify-write below.
			 */

			v = (calc_eax >> 26) & 0x3;

			printk(BIOS_DEBUG, "write_l2(4, %x)\n", v);

			a = read_l2(4);
			if (a >= 0) {
				a &= 0xfffc;
				a |= v;
				a = write_l2(4, a);
				/* a now contains result code from write_l2() */
			}
			/* Non-zero here is either a failed read or a failed write. */
			if (a != 0)
				goto bad;

			/* Restore the correct latency value into BBL_CR_CTL3 */
			bblctl3.lo = calc_eax;
			wrmsr(BBL_CR_CTL3, bblctl3);
		} /* ! 63x CPU */

		/* Read L2 register 0 */
		v = read_l2(0);

		/* If L2(0)[5] set (and can be read properly), enable CRTN and
		 * address parity
		 */
		if (v >= 0 && (v & 0x20)) {
			bblctl3 = rdmsr(BBL_CR_CTL3);
			bblctl3.lo |= (BBLCR3_L2_ADDR_PARITY_ENABLE |
				       BBLCR3_L2_CRTN_PARITY_ENABLE);
			wrmsr(BBL_CR_CTL3, bblctl3);
		}

		/* If something goes wrong at L2 ECC setup, cache ECC
		 * will just remain disabled.
		 */
		set_l2_ecc();

		if (calculate_l2_physical_address_range() != 0) {
			printk(BIOS_ERR, "Failed to calculate L2 physical address range\n");
			goto bad;
		}

		if (calculate_l2_cache_size() != 0) {
			printk(BIOS_ERR, "Failed to calculate L2 cache size\n");
			goto bad;
		}

		/* Turn on cache. Only L1 is active at this time. */
		enable_cache();

		/* Get the calculated cache size from BBL_CR_CTL3[17:13] */
		bblctl3 = rdmsr(BBL_CR_CTL3);
		cache_size = (bblctl3.lo & BBLCR3_L2_SIZE);
		/* A zero size field is treated as the value 0x1000. */
		if (cache_size == 0)
			cache_size = 0x1000;
		cache_size = cache_size << 3;

		/* TODO: Cache size above is per bank. We're supposed to get
		 * the number of banks from BBL_CR_CTL3[12:11].
		 * Confirm that this still provides the correct answer.
		 */
		bank = (bblctl3.lo >> 11) & 0x3;
		if (bank == 0)
			bank = 1;

		printk(BIOS_INFO, "size %dK... ", cache_size * bank * 4 / 1024);

		/* Write to all cache lines to initialize */

		while (cache_size > 0) {

			/* Each cache line is 32 bytes. */
			cache_size -= 32;

			/* Update each way */

			/* We're supposed to get L2 associativity from
			 * BBL_CR_CTL3[10:9]. But this code only applies to
			 * certain members of the P6 processor family and since
			 * all P6 processors have 4-way L2 cache, we can safely
			 * assume 4 way for all cache operations.
			 */

			for (v = 0; v < 4; v++) {
				/* Send Tag Write w/Data Write (TWW) to L2 controller
				 * MESI = Invalid
				 */
				if (signal_l2(cache_size, 0, 0, v, L2CMD_TWW | L2CMD_MESI_I) != 0) {
					printk(BIOS_ERR, "Failed on signal_l2(%x, %x)\n",
					       cache_size, v);
					goto bad;
				}
			}
		}
		printk(BIOS_DEBUG, "L2 Cache lines initialized\n");

		/* Disable cache */
		disable_cache();

		/* Set L2 cache configured in BBL_CR_CTL3 */
		bblctl3 = rdmsr(BBL_CR_CTL3);
		bblctl3.lo |= BBLCR3_L2_CONFIGURED;
		wrmsr(BBL_CR_CTL3, bblctl3);

		/* Invalidate cache and discard unsaved writes */
		asm volatile ("invd");

		/* Write 0 to L2 control register 5 */
		if (write_l2(5, 0) != 0) {
			printk(BIOS_ERR, "write_l2(5, 0) failed\n");
			/* NOTE(review): this path returns 0 with the L2 marked
			 * configured but not enabled - confirm that reporting
			 * success here is intended.
			 */
			goto done;
		}

		bblctl3 = rdmsr(BBL_CR_CTL3);
		if (signature == 0x650) {
			/* Change the L2 latency to 0101 then back to
			 * original value. I don't know why this is needed - dpd
			 */
			eax = bblctl3.lo;
			bblctl3.lo &= ~BBLCR3_L2_LATENCY;
			bblctl3.lo |= 0x0a;
			wrmsr(BBL_CR_CTL3, bblctl3);
			bblctl3.lo = eax;
			wrmsr(BBL_CR_CTL3, bblctl3);
		}

		/* Enable L2 in BBL_CR_CTL3 */
		bblctl3.lo |= BBLCR3_L2_ENABLED;
		wrmsr(BBL_CR_CTL3, bblctl3);

		/* The L2 enable bit is now set, but caching is still disabled
		 * by the disable_cache() call above; enable_cache() is not
		 * called again in this function.
		 */
	done:
		result = 0;
		goto out;
	bad:
		result = -1;
	out:
		/* Printed on both the success and the failure path. */
		printk(BIOS_INFO, "done.\n");
		return result;
	}