Problem
I’m working on a project where I need to map XML values to field names. Classic CS problem, I think I’ve got a pretty good approach but would like feedback.
For example: breachReportType
matches to Breach Report Type:
. However, there are similar matches like: breachType
or breachReportCause
.
The inputs are like …
- Xml by Field Name:
<xml><breachReportType>
- Field Name by ID:
[ { Key: '1234', Value: 'Breach Report Type' } ]
-
Content by Field ID:
<xml><Field id='1234' Value='Email'>
// this.MapValues //-> // Fields Example: [{ Key: '12345', Value: 'Breach Report Type' }] // XML Example: '<breachReportType />' private Dictionary<string, string> MapValues(Dictionary<string, string> fields, string xml) { var map = new Dictionary<string, string>(); var elements = XDocument.Parse(xml).Descendants(); foreach (XElement elm in elements) { string name = elm.Name.ToString(); string formattedName = this.SplitCamelCase(name); int currentMaxCount = 0; string bestMatchId = null; foreach (var field in fields) { string strippedFieldValue = field.Value.Substring(0, field.Value.Length - 1); // XML: breach Report Type // Field Name: Breach Type int match = this.MatchStrings(strippedFieldValue, formattedName); // once we find a match, update the index if (match > currentMaxCount) { if (bestMatchId != null) { // get the previous one string prevName; fields.TryGetValue(bestMatchId, out prevName); // compare the previous match int prev = this.MatchStrings(prevName, formattedName); // if the new match is greater than previous, update best match // and the string match is closer to the character match if (match > prev && ((strippedFieldValue.Length - match) < (prevName.Length - prev))) { bestMatchId = field.Key; currentMaxCount = match; } } else { bestMatchId = field.Key; currentMaxCount = match; } } } if (bestMatchId != null && !map.ContainsKey(bestMatchId)) { map.Add(bestMatchId, name); } } return map; } // MatchStrings //-> // firstString Example: 'breachReportType' // secondString Example: 'Breach Report Type' private int MatchStrings(string firstString, string secondString) { List<string> firstArr = firstString.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).ToList(); List<string> secondArr = secondString.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).ToList(); int match = 0; foreach (string string1 in firstArr) { foreach (string string2 in secondArr) { if (string1.ToLower().IndexOf(string2.ToLower()) > -1) { match = match + string2.Length; } } } return match; } // TranslateAndConcert //-> // Record Example: "<xml><Field id="1234" Value="Email">" // FieldMap Example: [{ Key: '1234', 'breachReportType' }] private string TranslateAndConvert(string record, Dictionary<string, string> fieldMap) { XDocument mappedXml = new XDocument(new XElement("topmostSubform")); var elements = XDocument.Parse(XDocument.Parse(record).Root.Value) .Descendants("Field") .ToList(); foreach (XElement elm in elements) { if (elm.Attribute("id") != null && elm.Attribute("value") != null) { // get the values from the xml string id = elm.Attribute("id").Value; string value = elm.Attribute("value").Value; // get the mapped name string name = null; fieldMap.TryGetValue(id, out name); // create new element if (name != null) { var node = new XElement(name, value); mappedXml.Root.Add(node); } } } return mappedXml.ToString(); } // SplitCamelCase //-> // input example: 'breachReportType' private string SplitCamelCase(string input) { return System.Text.RegularExpressions.Regex.Replace(input, "([A-Z])", " $1", System.Text.RegularExpressions.RegexOptions.Compiled).Trim(); }
Any better ideas for an approach?
Solution
You could possibly improve it by using a fuzzy match algorithm such as the Levenshtein Distance Algorithm for your MatchStrings method. Then you could simply look for the smallest return value of all of the comparisons and use that as the best without having to do the extra length comparisons. This would also work better if there are some spelling variations in the data.