我正在解析一个大的XML文件~500MB,它包含一些无效的XML字符0x07
,所以你可以想象发生了什么,XMLReader抛出一个无效的XML字符异常,处理这个我们将Stream
流式传输到StreamReader
并使用Regex.Replace
并写了结果使用StreamWriter
并将干净的版本流回XMLReader
,现在我想避免这种情况并直接从XMLReader中跳过这个肮脏的标签,我的问题是如果无论如何要实现这一点,下面是我尝试的代码片段要做到这一点,但它在这条线var node = (XElement)XNode.ReadFrom(xr);
抛出异常
protected override IEnumerable<XElement> StreamReader(Stream stream, string elementName)
{
var arrTag = elementName.Split('|').ToList();
using (var xr = XmlReader.Create(stream, new XmlReaderSettings { CheckCharacters = false }))
{
while (xr.Read())
{
if (xr.NodeType == XmlNodeType.Element && arrTag.Contains(xr.Name))
{
var node = (XElement)XNode.ReadFrom(xr);
node.ReplaceWith(node.Elements().Where(e => e.Name != "DaylightSaveInfo"));
yield return node;
}
}
xr.Close();
}
}
XML SAMPLE,无效属性DaylightSaveInfo
<?xml version="1.0" encoding="ISO-8859-1"?>
<LATree>
<LA className="BTT00NE" fdn="NE=9739">
<attr name="fdn">NE=9739</attr>
<attr name="IP">10.157.144.100</attr>
<attr name="realLatitude">0D0'0"S</attr>
<attr name="realLongitude">0D0'0"W</attr>
<attr name="DaylightSaveInfo">NO</attr>
</LA>
</LATree>
我刚刚看到Jon Skeet写了一些关于这个的东西,所以我不能真正得到荣誉,但由于他在SO上的地位高于我的,我或许可以获得一两点来写它。 :)
首先,我编写了一个重载TextReader类的类。 (链接中的一些参考资料。)
https://www.w3.org/TR/xml/#NT-Char
https://github.com/Microsoft/referencesource/blob/master/mscorlib/system/io/textreader.cs
class FilterInvalidXmlReader : System.IO.TextReader
{
private System.IO.StreamReader _streamReader;
public System.IO.Stream BaseStream => _streamReader.BaseStream;
public FilterInvalidXmlReader(System.IO.Stream stream) => _streamReader = new System.IO.StreamReader(stream);
public override void Close() => _streamReader.Close();
protected override void Dispose(bool disposing) => _streamReader.Dispose();
public override int Peek()
{
var peek = _streamReader.Peek();
while (IsInvalid(peek, true))
{
_streamReader.Read();
peek = _streamReader.Peek();
}
return peek;
}
public override int Read()
{
var read = _streamReader.Read();
while (IsInvalid(read, true))
{
read = _streamReader.Read();
}
return read;
}
public static bool IsInvalid(int c, bool invalidateCompatibilityCharacters)
{
if (c == -1)
{
return false;
}
if (invalidateCompatibilityCharacters && ((c >= 0x7F && c <= 0x84) || (c >= 0x86 && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF)))
{
return true;
}
if (c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD))
{
return false;
}
return true;
}
}
然后我创建了一个控制台应用程序,主要是我把:
using (var memoryStream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes("<Test><GoodAttribute>a\u0009b</GoodAttribute><BadAttribute>c\u0007d</BadAttribute></Test>")))
{
using (var xmlFilteredTextReader = new FilterInvalidXmlReader(memoryStream))
{
using (var xr = System.Xml.XmlReader.Create(xmlFilteredTextReader))
{
while (xr.Read())
{
if (xr.NodeType == System.Xml.XmlNodeType.Element)
{
var xe = System.Xml.Linq.XElement.ReadFrom(xr);
System.Console.WriteLine(xe.ToString());
}
}
}
}
}
希望这可能有所帮助,或者至少提供一些起点。
以下xml linq代码运行没有错误。我在xml文件中使用了“NO”:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using System.IO;
namespace ConsoleApplication108
{
class Program
{
const string FILENAME = @"c:\temp\test.xml";
static void Main(string[] args)
{
XmlReaderSettings settings = new XmlReaderSettings();
settings.CheckCharacters = false;
XmlReader reader = XmlReader.Create(FILENAME, settings);
XDocument doc = XDocument.Load(reader);
Dictionary<string, string> dict = doc.Descendants("attr")
.GroupBy(x => (string)x.Attribute("name"), y => (string)y)
.ToDictionary(x => x.Key, y => y.FirstOrDefault());
}
}
}