Highlight specific words in a sentence with diacritics

Posted on

Problem

I am searching for some improvements, particularly in the regex, in the way I highlight specific words in a string.

  • I have keywords stored in my database without any diacritics
  • The user provides a string that contains diacritics
  • I strip the diacritics from a copy of that string and find the matches between the keywords and the stripped copy
  • I highlight the matched words in the original sentence, which still has its diacritics

 // Diacritics removal table. Each entry pairs a plain `base` replacement with
 // a `letters` string listing every character that should be rewritten to that
 // base. The characters are written as \uXXXX escapes; without the backslash
 // the string would contain the literal text "u0041..." instead of the
 // characters themselves. Note that some bases are longer than one character
 // (e.g. 'AE', 'oe'), so replacing a letter can change the string's length.
 var defaultDiacriticsRemovalap = [
        {'base':'A', 'letters':'\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'},
        {'base':'AA','letters':'\uA732'},
        {'base':'AE','letters':'\u00C6\u01FC\u01E2'},
        {'base':'AO','letters':'\uA734'},
        {'base':'AU','letters':'\uA736'},
        {'base':'AV','letters':'\uA738\uA73A'},
        {'base':'AY','letters':'\uA73C'},
        {'base':'B', 'letters':'\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181'},
        {'base':'C', 'letters':'\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E'},
        {'base':'D', 'letters':'\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779'},
        {'base':'DZ','letters':'\u01F1\u01C4'},
        {'base':'Dz','letters':'\u01F2\u01C5'},
        {'base':'E', 'letters':'\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E'},
        {'base':'F', 'letters':'\u0046\u24BB\uFF26\u1E1E\u0191\uA77B'},
        {'base':'G', 'letters':'\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E'},
        {'base':'H', 'letters':'\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D'},
        {'base':'I', 'letters':'\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197'},
        {'base':'J', 'letters':'\u004A\u24BF\uFF2A\u0134\u0248'},
        {'base':'K', 'letters':'\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2'},
        {'base':'L', 'letters':'\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780'},
        {'base':'LJ','letters':'\u01C7'},
        {'base':'Lj','letters':'\u01C8'},
        {'base':'M', 'letters':'\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C'},
        {'base':'N', 'letters':'\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4'},
        {'base':'NJ','letters':'\u01CA'},
        {'base':'Nj','letters':'\u01CB'},
        {'base':'O', 'letters':'\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C'},
        {'base':'OI','letters':'\u01A2'},
        {'base':'OO','letters':'\uA74E'},
        {'base':'OU','letters':'\u0222'},
        {'base':'OE','letters':'\u008C\u0152'},
        {'base':'oe','letters':'\u009C\u0153'},
        {'base':'P', 'letters':'\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754'},
        {'base':'Q', 'letters':'\u0051\u24C6\uFF31\uA756\uA758\u024A'},
        {'base':'R', 'letters':'\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782'},
        {'base':'S', 'letters':'\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784'},
        {'base':'T', 'letters':'\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786'},
        {'base':'TZ','letters':'\uA728'},
        {'base':'U', 'letters':'\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244'},
        {'base':'V', 'letters':'\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245'},
        {'base':'VY','letters':'\uA760'},
        {'base':'W', 'letters':'\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72'},
        {'base':'X', 'letters':'\u0058\u24CD\uFF38\u1E8A\u1E8C'},
        {'base':'Y', 'letters':'\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE'},
        {'base':'Z', 'letters':'\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762'},
        {'base':'a', 'letters':'\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250'},
        {'base':'aa','letters':'\uA733'},
        {'base':'ae','letters':'\u00E6\u01FD\u01E3'},
        {'base':'ao','letters':'\uA735'},
        {'base':'au','letters':'\uA737'},
        {'base':'av','letters':'\uA739\uA73B'},
        {'base':'ay','letters':'\uA73D'},
        {'base':'b', 'letters':'\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253'},
        {'base':'c', 'letters':'\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184'},
        {'base':'d', 'letters':'\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A'},
        {'base':'dz','letters':'\u01F3\u01C6'},
        {'base':'e', 'letters':'\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD'},
        {'base':'f', 'letters':'\u0066\u24D5\uFF46\u1E1F\u0192\uA77C'},
        {'base':'g', 'letters':'\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F'},
        {'base':'h', 'letters':'\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265'},
        {'base':'hv','letters':'\u0195'},
        {'base':'i', 'letters':'\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131'},
        {'base':'j', 'letters':'\u006A\u24D9\uFF4A\u0135\u01F0\u0249'},
        {'base':'k', 'letters':'\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3'},
        {'base':'l', 'letters':'\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747'},
        {'base':'lj','letters':'\u01C9'},
        {'base':'m', 'letters':'\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F'},
        {'base':'n', 'letters':'\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5'},
        {'base':'nj','letters':'\u01CC'},
        {'base':'o', 'letters':'\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275'},
        {'base':'oi','letters':'\u01A3'},
        {'base':'ou','letters':'\u0223'},
        {'base':'oo','letters':'\uA74F'},
        {'base':'p','letters':'\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755'},
        {'base':'q','letters':'\u0071\u24E0\uFF51\u024B\uA757\uA759'},
        {'base':'r','letters':'\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783'},
        {'base':'s','letters':'\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B'},
        {'base':'t','letters':'\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787'},
        {'base':'tz','letters':'\uA729'},
        {'base':'u','letters': '\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289'},
        {'base':'v','letters':'\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C'},
        {'base':'vy','letters':'\uA761'},
        {'base':'w','letters':'\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73'},
        {'base':'x','letters':'\u0078\u24E7\uFF58\u1E8B\u1E8D'},
        {'base':'y','letters':'\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF'},
        {'base':'z','letters':'\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763'}
    ];

    // Flatten the removal table into a direct lookup object: every character
    // listed in an entry's `letters` string becomes a key whose value is that
    // entry's `base` replacement.
    var diacriticsMap = {};
    defaultDiacriticsRemovalap.forEach(function (entry) {
        entry.letters.split('').forEach(function (ch) {
            diacriticsMap[ch] = entry.base;
        });
    });

    // "what?" version ... http://jsperf.com/diacritics/12
    /**
     * Replace every non-ASCII character of `str` that has an entry in
     * `diacriticsMap` with its mapped base; all other characters are kept.
     *
     * @param {string} str - Text to normalize.
     * @returns {string} The text with mapped characters replaced. The result
     *     may differ in length from `str` when a mapped base is longer than
     *     one character.
     */
    function removeDiacritics (str) {
        // The class must use real \uXXXX escapes: it selects everything outside
        // U+0000..U+007E, so plain ASCII never triggers a map lookup.
        return str.replace(/[^\u0000-\u007E]/g, function(a){
           return diacriticsMap[a] || a;
        });
    }

//Function to highlight matched keywords :
var sentence = "J'eusse été amélioré sans des aigÜs si hauts améliorés et raisonnable. Amélioration réaméliorée";
var output = sentence;
var word = ['eusse','ameliore','aigu','aimerais']; //Matched keywords extracted from the database, where they are stored without diacritics

var new_s = sentence, match, nb = 0, index = [];
// Diacritic-free copy used only for searching. Every apostrophe (not just the
// first one) becomes a space so that it acts as a word delimiter.
// NOTE: the offsets found in this copy are applied to the original sentence,
// which is only valid while removeDiacritics keeps the string length unchanged
// (a replacement such as 'æ' -> 'ae' would shift every later offset).
var new_s_cache = removeDiacritics(sentence).replace(/'/g," ");
output = output + "<br />" + new_s_cache ;

for (var i=0;i<word.length;i++){// For each matched word, store the position and length of every hit in index
  var wordInput = word[i].substr(0,4);
  if (!wordInput) { continue; } // an empty prefix would match everywhere (and loop forever on zero-length matches)
  // Backslashes must be doubled inside a string literal, otherwise "\W" is just "W".
  // The delimiter is captured so that it can be excluded from the highlighted range.
  // NOTE: wordInput is inserted verbatim; keywords are assumed to be plain letters.
  var re = new RegExp("(^|\\W)"+wordInput+"\\w*(?!\\w)","gi");
  while (match = re.exec(new_s_cache)) {
    index.push([match.index + match[1].length, match[0].length - match[1].length]);
  }
}
// Sort numerically by start offset; the default sort compares the pairs as strings ("10,5" < "9,4").
index.sort(function(a, b){ return a[0] - b[0]; });
var lastEnd = 0; // end offset (in the untagged sentence) of the last highlighted word
for (var j=0;j<index.length;j++) { //Highlight the words in the original sentence with diacritics
    var start = index[j][0], end = start + index[j][1];
    if (start < lastEnd) { continue; } // same word already highlighted via another keyword
    new_s = new_s.substring(0,start+nb) + "<b>" + new_s.substring(start+nb,end+nb) + "</b>" + new_s.substring(end+nb);
    nb = nb + 7; // take into account the "<b></b>" length
    lastEnd = end;
}

output = output + "<br />" + new_s ;

document.getElementById('test').innerHTML = output;

Where I am seeking improvements:

  • The regex: I want to catch words that are similar to the keywords. That’s why I arbitrarily took the first 4 letters of each keyword and then ran a regex to find the words that contain these 4 letters, but this is not optimal. For instance, I’d like to find ‘unapproved’ when the keyword is ‘approve’.
  • The idea of the code: juggling between the original sentence and the copy without diacritics seems quite fragile.

jsFiddle

Solution

You can avoid messing around with string indexes if you don’t try to normalize the text. Rather, you can transform the search terms into diacritic-insensitive regular expressions. For example, "eusse" could become /(?:e|é|É|è|È|ë|Ë)(?:u|ü|Ü)ss(?:e|é|É|è|È|ë|Ë)/. You can then apply that regular expression on the text directly.

/**
 * Build a function that turns a plain search term into the source of a
 * diacritic-insensitive regular expression.
 *
 * @param {Object} normalizationMap - Maps each accented character to its
 *     unaccented base (e.g. {'é': 'e', 'æ': 'ae'}).
 * @returns {function(string): string} A function that takes a search term and
 *     returns regular-expression source in which every occurrence of a base is
 *     replaced by a group matching the base or any character that normalizes
 *     to it; all other characters are escaped so they match literally.
 */
function normalizedRegExpMaker(normalizationMap) {
    // From https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
    function escapeRegExp(str) {
        return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    }

    // Invert the map, so that the values become keys, and the keys are listed
    // as values. A prototype-less object keeps bases such as "constructor"
    // from colliding with inherited properties.
    var diacriticReverse = (function(equivs) {
        var reverse = Object.create(null);
        Object.keys(equivs).forEach(function(c) {
            var base = equivs[c];
            if (!reverse[base]) {
                reverse[base] = [base];
            }
            reverse[base].push(c);
        });
        return reverse;
    })(normalizationMap);

    // Longest bases first, so that "ae" is tried before "a"; empty bases are
    // dropped because an empty alternative would match at every position.
    var bases = Object.keys(diacriticReverse)
        .filter(function(base) { return base.length > 0; })
        .sort(function(a, b) { return b.length - a.length; });

    // Tokenizer: either a known base, or any single character as a fallback.
    var tokenRe = new RegExp(
        bases.map(escapeRegExp).concat('[\\s\\S]').join('|'),
        'g'
    );

    return function(s) {
        return s.replace(tokenRe, function(token) {
            var variants = diacriticReverse[token];
            if (!variants) {
                return escapeRegExp(token);
            }
            return '(?:' + variants.map(escapeRegExp).join('|') + ')';
        });
    };
}

/**
 * Replace every word of `text` that contains one of `words`, ignoring case and
 * the diacritics described by `normalizationMap`.
 *
 * @param {Object} normalizationMap - Maps accented characters to their base.
 * @param {string[]} words - Search terms written without diacritics; empty
 *     strings are ignored.
 * @param {string} text - Text to search.
 * @param {function(string): string} transform - Replacement callback handed to
 *     String.prototype.replace; its first argument is the whole matched word.
 * @returns {string} `text` with each matching word replaced by the result of
 *     `transform`, or `text` unchanged when there is no usable search term.
 */
function normalizedWordReplace(normalizationMap, words, text, transform) {
    // An empty term would yield an empty alternative that matches every word.
    var terms = words.filter(function(w) { return w !== ''; });
    if (terms.length === 0) {
        return text;
    }

    // Accented letters count as word characters too; escape the few characters
    // that are special inside a character class. The backslash of \w must be
    // doubled because this is a string literal.
    var extraLetters = Object.keys(normalizationMap).join('').replace(/[\\\]\^\-]/g, '\\$&');
    var lettersRe = '[\\w' + extraLetters + ']*';
    var regexpMaker = normalizedRegExpMaker(normalizationMap);
    var alternatives = terms.map(function(w) { return regexpMaker(w); });
    var re = new RegExp(
        lettersRe + '(?:' + alternatives.join('|') + ')' + lettersRe,
        'gi'
    );

    return text.replace(re, transform);
}

/**
 * Wrap a string in an HTML bold element.
 *
 * @param {string} s - Text to wrap; it is inserted as-is, without escaping.
 * @returns {string} `s` surrounded by "<b>" and "</b>".
 */
function bolden(s) {
    var openTag = '<b>';
    var closeTag = '</b>';
    return openTag + s + closeTag;
}

// Accented French letters mapped to the unaccented, lower-case letters used in
// search terms. Upper-case variants map to lower-case bases as well.
var FRENCH_NORMALIZATION = {
    'ç': 'c',      'Ç': 'c',
    'à': 'a',      'À': 'a',
    'â': 'a',      'Â': 'a',
    'é': 'e',      'É': 'e',
    'è': 'e',      'È': 'e',
    'ê': 'e',      'Ê': 'e',
    'ë': 'e',      'Ë': 'e',
    'î': 'i',      'Î': 'i',
    'ï': 'i',      'Ï': 'i',
    'ô': 'o',      'Ô': 'o',
    'æ': 'ae',     'Æ': 'ae',
    'œ': 'oe',     'Œ': 'oe',
    'ù': 'u',      'Ù': 'u',
    'û': 'u',      'Û': 'u',
    'ü': 'u',      'Ü': 'u',
    'ÿ': 'y',      'Ÿ': 'y'
};

/**
 * Read the search terms and the text from the form, highlight every word of
 * the text that contains one of the terms, and show the result in #output.
 */
function search() {
    // Split on runs of whitespace (the backslash in \s is essential) and drop
    // the empty strings produced by leading or trailing whitespace.
    var searchTerms = $('#terms').val().split(/\s+/).filter(function(term) {
        return term !== '';
    });
    var text = $('#text').val();
    var output = normalizedWordReplace(FRENCH_NORMALIZATION, searchTerms, text, bolden);
    // NOTE(review): the text is inserted as HTML without escaping, so markup
    // typed into the text field is interpreted by the browser.
    $('#output').html(output);
}
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
<form>
  <label>Search terms (space-separated):<br>
    <input id="terms" size="80"
           value="eusse ameliore aigu aimerais">
  </label>
  <br>
  <label>Text:<br>
    <input id="text" size="80"
           value="J'eusse été amélioré sans des aigÜs si hauts améliorés et raisonnable. Amélioration réaméliorée">
  </label>
  <br>
  <!-- type="button": a button inside a form defaults to type="submit", which would submit the form on click -->
  <button type="button" onclick="search()">Search</button>
</form>
<div id="output"></div>

Note that in this demo, as in yours, I am playing fast and loose with HTML escaping. If the text contains, say, a <script> tag, it will be injected into the resulting display.

You appear to have used an all-purpose normalization map. Note that there are likely language-specific rules. French treats diacritics as mere decorations. However, that isn’t the case in German, where ‘ü’ is the common way of writing “ue”.

Leave a Reply

Your email address will not be published. Required fields are marked *