repoman: check metadata.xml xml decl, bug #328113
diff --git a/bin/repoman b/bin/repoman
index 5618cf1..8c49c06 100755
--- a/bin/repoman
+++ b/bin/repoman
@@ -508,6 +508,9 @@
 	"dev-libs/libusb-compat":"virtual/libusb",
 }
 
+metadata_xml_encoding = 'UTF-8'  # encoding the metadata.xml declaration must name
+metadata_xml_declaration = '<?xml version="1.0" encoding="%s"?>' % \
+	(metadata_xml_encoding,)  # quoted in the "xml declaration is missing" failure
 metadata_doctype_name = 'pkgmetadata'
 metadata_dtd_uri = 'http://www.gentoo.org/dtd/metadata.dtd'
 # force refetch if the local copy creation time is older than this
@@ -1274,17 +1277,38 @@
 			v += "/"
 		thirdpartymirrors[v] = k
 
+class _XMLParser(xml.etree.ElementTree.XMLParser):
+	"""XMLParser that stores the xml declaration and DOCTYPE in the data dict."""
+	def __init__(self, data, **kwargs):
+		xml.etree.ElementTree.XMLParser.__init__(self, **kwargs)
+		self._portage_data = data
+		if hasattr(self, 'parser'):  # handlers can only be hooked via self.parser
+			self._base_XmlDeclHandler = self.parser.XmlDeclHandler
+			self.parser.XmlDeclHandler = self._portage_XmlDeclHandler
+			self._base_StartDoctypeDeclHandler = \
+				self.parser.StartDoctypeDeclHandler
+			self.parser.StartDoctypeDeclHandler = \
+				self._portage_StartDoctypeDeclHandler
+
+	def _portage_XmlDeclHandler(self, version, encoding, standalone):
+		if self._base_XmlDeclHandler is not None:  # call the handler we replaced
+			self._base_XmlDeclHandler(version, encoding, standalone)
+		self._portage_data["XML_DECLARATION"] = (version, encoding, standalone)
+
+	def _portage_StartDoctypeDeclHandler(self, doctypeName, systemId, publicId,
+		has_internal_subset):
+		if self._base_StartDoctypeDeclHandler is not None:  # call the handler we replaced
+			self._base_StartDoctypeDeclHandler(doctypeName, systemId, publicId,
+				has_internal_subset)
+		self._portage_data["DOCTYPE"] = (doctypeName, systemId, publicId)
+
 class _MetadataTreeBuilder(xml.etree.ElementTree.TreeBuilder):
 	"""
 	Implements doctype() as required to avoid deprecation warnings with
 	>=python-2.7.
 	"""
-	def __init__(self, data):
-		xml.etree.ElementTree.TreeBuilder.__init__(self)
-		self._portage_data = data
-
 	def doctype(self, name, pubid, system):
-		self._portage_data["DOCTYPE"] = (name, pubid, system)
+		pass  # DOCTYPE is recorded by _XMLParser's StartDoctypeDeclHandler hook instead
 
 try:
 	herd_base = make_herd_base(os.path.join(repoman_settings["PORTDIR"], "metadata/herds.xml"))
@@ -1644,43 +1668,68 @@
 	else:
 		metadata_bad = False
 		xml_info = {}
+		xml_parser = _XMLParser(xml_info, target=_MetadataTreeBuilder())
 
 		# read metadata.xml into memory
 		try:
 			_metadata_xml = xml.etree.ElementTree.parse(
 				_unicode_encode(os.path.join(checkdir, "metadata.xml"),
 				encoding=_encodings['fs'], errors='strict'),
-				parser=xml.etree.ElementTree.XMLParser(
-					target=_MetadataTreeBuilder(xml_info)))
+				parser=xml_parser)
 		except (ExpatError, SyntaxError, EnvironmentError) as e:
 			metadata_bad = True
 			stats["metadata.bad"] += 1
 			fails["metadata.bad"].append("%s/metadata.xml: %s" % (x, e))
 			del e
 		else:
-			if sys.hexversion < 0x2070000 or \
+			if not hasattr(xml_parser, 'parser') or \
+				sys.hexversion < 0x2070000 or \
 				(sys.hexversion > 0x3000000 and sys.hexversion < 0x3020000):
 				# doctype is not parsed with python 2.6 or 3.1
 				pass
-			elif "DOCTYPE" not in xml_info:
-				metadata_bad = True
-				stats["metadata.bad"] += 1
-				fails["metadata.bad"].append("%s/metadata.xml: %s" % (x,
-					"DOCTYPE is missing"))
 			else:
-				doctype_name, doctype_pubid, doctype_system = \
-					xml_info["DOCTYPE"]
-				if doctype_system != metadata_dtd_uri:
+				if "XML_DECLARATION" not in xml_info:  # key is only set by _XMLParser's XmlDeclHandler hook
 					stats["metadata.bad"] += 1
 					fails["metadata.bad"].append("%s/metadata.xml: "
-						"DOCTYPE: SYSTEM should refer to '%s', not '%s'" %
-						(x, metadata_dtd_uri, doctype_system))
+						"xml declaration is missing on first line, "
+						"should be '%s'" % (x, metadata_xml_declaration))
+				else:
+					xml_version, xml_encoding, xml_standalone = \
+						xml_info["XML_DECLARATION"]
+					if xml_encoding is None or \
+						xml_encoding.upper() != metadata_xml_encoding:  # case-insensitive comparison
+						stats["metadata.bad"] += 1
+						if xml_encoding is None:  # the handler received no encoding value
+							encoding_problem = "but it is undefined"
+						else:
+							encoding_problem = "not '%s'" % xml_encoding
+						fails["metadata.bad"].append("%s/metadata.xml: "
+							"xml declaration encoding should be '%s', %s" %
+							(x, metadata_xml_encoding, encoding_problem))
 
-				if doctype_name != metadata_doctype_name:
+				if "DOCTYPE" not in xml_info:  # key is only set by _XMLParser's StartDoctypeDeclHandler hook
+					metadata_bad = True
 					stats["metadata.bad"] += 1
-					fails["metadata.bad"].append("%s/metadata.xml: "
-						"DOCTYPE: name should be '%s', not '%s'" %
-						(x, metadata_doctype_name, doctype_name))
+					fails["metadata.bad"].append("%s/metadata.xml: %s" % (x,
+						"DOCTYPE is missing"))
+				else:
+					doctype_name, doctype_system, doctype_pubid = \
+						xml_info["DOCTYPE"]  # stored as (name, system id, public id)
+					if doctype_system != metadata_dtd_uri:
+						stats["metadata.bad"] += 1
+						if doctype_system is None:  # the handler received no system id
+							system_problem = "but it is undefined"
+						else:
+							system_problem = "not '%s'" % doctype_system
+						fails["metadata.bad"].append("%s/metadata.xml: "
+							"DOCTYPE: SYSTEM should refer to '%s', %s" %
+							(x, metadata_dtd_uri, system_problem))
+
+					if doctype_name != metadata_doctype_name:
+						stats["metadata.bad"] += 1
+						fails["metadata.bad"].append("%s/metadata.xml: "
+							"DOCTYPE: name should be '%s', not '%s'" %
+							(x, metadata_doctype_name, doctype_name))
 
 			# load USE flags from metadata.xml
 			try: