String Interpolation / Word Matching with XML in C#

Posted on

Problem

I’m working on a project where I need to map XML values to field names. Classic CS problem, I think I’ve got a pretty good approach but would like feedback.

For example: breachReportType matches to Breach Report Type:. However, there are similar matches like: breachType or breachReportCause.

The inputs are like …

  • Xml by Field Name: <xml><breachReportType>
  • Field Name by ID: [ { Key: '1234', Value: 'Breach Report Type' } ]
  • Content by Field ID: <xml><Field id='1234' Value='Email'>

    //  this.MapValues //->
    //      Fields Example: [{ Key: '12345', Value: 'Breach Report Type' }]
    //      XML Example: '<breachReportType />'    
    private Dictionary<string, string> MapValues(Dictionary<string, string> fields, string xml)
    {
        var map = new Dictionary<string, string>();
        var elements = XDocument.Parse(xml).Descendants();
    
        foreach (XElement elm in elements)
        {
            string name = elm.Name.ToString();
            string formattedName = this.SplitCamelCase(name);
            int currentMaxCount = 0;
            string bestMatchId = null;
    
            foreach (var field in fields)
            {
                string strippedFieldValue = field.Value.Substring(0, field.Value.Length - 1);
    
                // XML: breach Report Type
                // Field Name: Breach Type
                int match = this.MatchStrings(strippedFieldValue, formattedName);
    
                // once we find a match, update the index
                if (match > currentMaxCount)
                {
                    if (bestMatchId != null)
                    {
                        // get the previous one
                        string prevName;
                        fields.TryGetValue(bestMatchId, out prevName);
    
                        // compare the previous match
                        int prev = this.MatchStrings(prevName, formattedName);
    
                        // if the new match is greater than previous, update best match
                        // and the string match is closer to the character match
                        if (match > prev && ((strippedFieldValue.Length - match) < (prevName.Length - prev)))
                        {
                            bestMatchId = field.Key;
                            currentMaxCount = match;
                        }
                    }
                    else
                    {
                        bestMatchId = field.Key;
                        currentMaxCount = match;
                    }
                }
            }
    
            if (bestMatchId != null && !map.ContainsKey(bestMatchId))
            {
                map.Add(bestMatchId, name);
            }
        }
    
        return map;
    }
    
    // MatchStrings //->
    //      firstString Example: 'breachReportType'
    //      secondString Example: 'Breach Report Type'
    private int MatchStrings(string firstString, string secondString)
    {
        List<string> firstArr = firstString.Split(new[] { " " },
                    StringSplitOptions.RemoveEmptyEntries).ToList();
    
        List<string> secondArr = secondString.Split(new[] { " " },
            StringSplitOptions.RemoveEmptyEntries).ToList();
    
        int match = 0;
    
        foreach (string string1 in firstArr)
        {
            foreach (string string2 in secondArr)
            {
                if (string1.ToLower().IndexOf(string2.ToLower()) > -1)
                {
                    match = match + string2.Length;
                }
            }
        }
    
        return match;
    }
    
    // TranslateAndConcert //->
    //      Record Example: "<xml><Field id="1234" Value="Email">"
    //      FieldMap Example: [{ Key: '1234', 'breachReportType' }]
    private string TranslateAndConvert(string record, Dictionary<string, string> fieldMap)
    {
        XDocument mappedXml = new XDocument(new XElement("topmostSubform"));
    
        var elements = XDocument.Parse(XDocument.Parse(record).Root.Value)
            .Descendants("Field")
            .ToList();
    
        foreach (XElement elm in elements)
        {
            if (elm.Attribute("id") != null && elm.Attribute("value") != null)
            {
                // get the values from the xml
                string id = elm.Attribute("id").Value;
                string value = elm.Attribute("value").Value;
    
                // get the mapped name
                string name = null;
                fieldMap.TryGetValue(id, out name);
    
                // create new element
                if (name != null)
                {
                    var node = new XElement(name, value);
                    mappedXml.Root.Add(node);
                }
            }
        }
    
        return mappedXml.ToString();
    }
    
    // SplitCamelCase //->
    //      input example: 'breachReportType'
    private string SplitCamelCase(string input)
    {
        return System.Text.RegularExpressions.Regex.Replace(input, "([A-Z])", 
            " $1", System.Text.RegularExpressions.RegexOptions.Compiled).Trim();
    }
    

Any better ideas for an approach?

Solution

You could possibly improve it by using a fuzzy match algorithm such as the Levenshtein Distance Algorithm for your MatchStrings method. Then you could simply look for the smallest return value of all of the comparisons and use that as the best without having to do the extra length comparisons. This would also work better if there are some spelling variations in the data.

Leave a Reply

Your email address will not be published. Required fields are marked *