Problem
This code reads a .cs source file in the bin folder and parses the C# code in it. The program outputs keywords, identifiers, separators and numerical constants.
How could it be improved?
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Tema3Compilatoare
{
class LexicalAnalysis
{
string[] keywords = { "abstract", "as", "base", "bool", "break", "by",
"byte", "case", "catch", "char", "checked", "class", "const",
"continue", "decimal", "default", "delegate", "do", "double",
"descending", "explicit", "event", "extern", "else", "enum",
"false", "finally", "fixed", "float", "for", "foreach", "from",
"goto", "group", "if", "implicit", "in", "int", "interface",
"internal", "into", "is", "lock", "long", "new", "null", "namespace",
"object", "operator", "out", "override", "orderby", "params",
"private", "protected", "public", "readonly", "ref", "return",
"switch", "struct", "sbyte", "sealed", "short", "sizeof",
"stackalloc", "static", "string", "select", "this",
"throw", "true", "try", "typeof", "uint", "ulong", "unchecked",
"unsafe", "ushort", "using", "var", "virtual", "volatile",
"void", "while", "where", "yield" };
string[] separator = { ";", "{","}","r","n","rn"};
string[] comments = { "//", "/*", "*/" };
string[] operators = { "+", "-", "*", "/", "%", "&","(",")","[","]",
"|", "^", "!", "~", "&&", "||",",",
"++", "--", "<<", ">>", "==", "!=", "<", ">", "<=",
">=", "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=",
"^=", "<<=", ">>=", ".", "[]", "()", "?:", "=>", "??" };
public string Parse(string item)
{
StringBuilder str = new StringBuilder();
int ok;
if (Int32.TryParse(item, out ok))
{
str.Append("(numerical constant, " +item + ") ");
return str.ToString();
}
if (item.Equals("rn"))
{
return "rn";
}
if(CheckKeyword(item) ==true)
{
str.Append("(keyword, " + item + ") ");
return str.ToString();
}
if (CheckOperator(item) == true)
{
str.Append("(operator, " + item + ") ");
return str.ToString();
}
if (CheckDelimiter(item) == true)
{
str.Append("(separator, " + item + ") ");
return str.ToString();
}
str.Append("(identifier, " + item + ") ");
return str.ToString();
}
private bool CheckOperator(string str)
{
if (Array.IndexOf(operators, str) > -1)
return true;
return false;
}
private bool CheckDelimiter(string str)
{
if (Array.IndexOf(separator, str) > -1)
return true;
return false;
}
private bool CheckKeyword(string str)
{
if (Array.IndexOf(keywords, str) > -1)
return true;
return false;
}
private bool CheckComments(string str)
{
if (Array.IndexOf(comments, str) > -1)
return true;
return false;
}
public string GetNextLexicalAtom(ref string item)
{
StringBuilder token = new StringBuilder();
for (int i = 0; i < item.Length; i++)
{
if (CheckDelimiter(item[i].ToString()))
{
if (i + 1 < item.Length && CheckDelimiter(item.Substring(i, 2)))
{
token.Append(item.Substring(i, 2));
item = item.Remove(i, 2);
return Parse(token.ToString());
}
else
{
token.Append(item[i]);
item = item.Remove(i, 1);
return Parse(token.ToString());
}
}
else if (CheckOperator(item[i].ToString()))
{
if (i + 1 < item.Length && (CheckOperator(item.Substring(i, 2))))
if (i + 2 < item.Length && CheckOperator(item.Substring(i, 3)))
{
token.Append(item.Substring(i, 3));
item = item.Remove(i, 3);
return Parse(token.ToString());
}
else
{
token.Append(item.Substring(i, 2));
item = item.Remove(i, 2);
return Parse(token.ToString());
}
else if (CheckComments(item.Substring(i, 2)))
{
if(item.Substring(i,2).Equals("//"))
{
do
{
i++;
} while (item[i] != 'n');
item = item.Remove(0, i + 1);
item = item.Trim(' ', 't', 'r', 'n');
i = -1;
}
else
{
do
{
i++;
} while (item.Substring(i, 2).Equals("*/") == false);
item = item.Remove(0, i + 2);
item = item.Trim(' ', 't','r','n');
i = -1;
}
}
else
{
int ok;
if (item[i] == '-' && Int32.TryParse(item[i + 1].ToString(), out ok))
continue;
token.Append(item[i]);
item = item.Remove(i,1);
return Parse(token.ToString());
}
}
else
if (item[i] == ''' )
{
int j = i + 1;
if (item[j] == '\')
j += 2;
else
j++;
token.Append("(literal constant, ").Append(item.Substring(i, j - i + 1)).Append(") ");
item = item.Remove(i, j - i + 1);
return token.ToString();
}
else
if (item[i]=='"' )
{
int j = i + 1;
while (item[j] != '"' )
j++;
token.Append("(literal constant, ").Append(item.Substring(i, j - i+1)).Append(") ");
item = item.Remove(i, j - i + 1);
return token.ToString();
}
else
if(item[i+1].ToString().Equals(" ") || CheckDelimiter(item[i+1].ToString())==true || CheckOperator(item[i+1].ToString())==true)
{
if (Parse(item.Substring(0, i+1)).Contains("numerical constant") && item[i+1]=='.')
{
int j = i + 2;
while (item[j].ToString().Equals(" ") == false && CheckDelimiter(item[j].ToString()) == false && CheckOperator(item[j].ToString()) == false)
j++;
int ok;
if (Int32.TryParse(item.Substring(i + 2, j - i-2), out ok))
{
token.Append("(numerical constant, ").Append(item.Substring(0, j )).Append(") ");
item = item.Remove(0, j );
return token.ToString();
}
}
token.Append(item.Substring(0, i+1));
item=item.Remove(0, i+1);
return Parse(token.ToString());
}
}
return null;
}
}
class Program
{
static void Main(string[] args)
{
string text = System.IO.File.ReadAllText("Program.cs");
//string[] elements = text.Split(new char[] { ' ', 'r', 'n' },StringSplitOptions.RemoveEmptyEntries);
LexicalAnalysis analyzer = new LexicalAnalysis();
// analyzer.Parse(text);
while(text!=null)
{
text = text.Trim(' ','t');
string token=analyzer.GetNextLexicalAtom(ref text);
System.Console.Write(token);
}
//foreach (string item in elements)
//{
// analyzer.Parse(item);
//}
//System.Console.WriteLine(text);
System.Console.Read();
}
}
}
Solution
The keywords
, separator
, comments
, and operators
arrays could be static readonly
, so that they don’t need to be re-initialized for every instance of a LexicalAnalysis
class you create; the type would probably be better off as LexicalAnalyzer
though. The analyzer performs the analysis, but it is not “the analysis”.
Your private
members members are all explicitly private
, except these arrays. That’s an easily fixed inconsistency.
There’s a lot of this:
if (Array.IndexOf(comments, str) > -1)
return true;
return false;
That could be simply written as:
return comments.Contains(str);
Your public interface looks like this:
string Parse(string item);
string GetNextLexicalAtom(ref string item);
…which isn’t crystal-clear. What’s an item
? Is a “lexical atom” a token? Why is item
passed by reference?
Looking at the commented-out code in the Program
class, it seems you’re not sure either.
I like to design my objects starting with the public interface – if I like what I’m seeing then I go and implement it. In this case I’d probably have gone with something like this:
IParseTree Parse(string code);
What’s an IParseTree
? It’s what parsers typically return: a tree structure representing the code that was parsed. A Parse(string)
method that returns a string
is quite puzzling actually.
The more you’ll want to extend this code into an actual lexer/parser, the more crying the need will become to define a grammar, and generate the lexer/parser off the grammar rules. Look into Antl4 for C# if that’s an avenue you wish to explore.
Alternatively, if you’re on C# 6.0, you could leverage the Roslyn API and parse C# code using the actual C# compiler!
Here are some hints on how you might improve it:
- Change
Parse()
to return objects, not string – When you parse the text you should increase the information available for the next run through the list. One way to do this would be to addToken
‘s to a list. You could specialise so that you haveNumericalToken
,Separators
,KeywordToken
, … and so on. On your next run, you don’t need to parse the string over again, but have nice tokens with the associated readily available. -
Implement at switch to get type of token – Instead of having multiple if’s, I would look into having a something like a dictionary of all the matchable strings, where the value is the token type. This would allow for a
switch
statement before you create a token of appropriate type.If it doesn’t match anything, use the default clause (and possibly some extra tests) to add it as an identifier token.
And then I find GetNextLexicalAtom
just to find that this is called before the other, and does much of the same stuff… Strange… You should aim for one major parser, and let is select which tokens you find along the way, and then afterwards parse the token list into something sensible.
To get a proper token list you need to separate your string on any split between elements, which might be as simple as spaces and the delimiter list. Whenever you find either a delimiter or something else, add a token to the list.
Afterwards, you can parse the token list, and consume tokens depending on the context you are in. For example if you have a CommentStartToken
, you consume all tokens until the matching CommentEndToken
, similar for StringToken
and other tokens.
The main point is that after you’ve built a proper token list (not a string), it is rather easy to define your language as a sequence of given Tokens
.
A final point on your main program is that you read the entire file into memory, whilst it could be better to read and parse it line by line when translating into tokens. You’ll surely not get that large test files, but it is a better way to handle files normally.
Before publishing any of your code, let the IDE format it (Visual Studio: Ctrl+K, Ctrl+D
) so that it looks good at least on the lexical level.
Then, let your IDE help you with writing concise and consistent code. ReSharper from JetBrains has many inspections that look at your code and suggest improvements. One of them is to replace expr == true
with simply expr
, which has the same meaning. It can also fix them automatically for you.
Only after these two steps, your code is ready for review by other humans.