Problem
Note: This file is out of my hands. I cannot change the format or type of file I have to parse.
Here is some sample data that I’m trying to parse. This is information for just one player:
[0] (com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)#8
_profileIconId = 4
elo = 0
eloChange = 0
gameId = 82736631
gameItems = (null)
inChat = false
leaver = false
leaves = 1
level = 5
losses = 2
profileIconId = 4
skinName = "Vladimir"
statistics = (mx.collections::ArrayCollection)#9
filterFunction = (null)
length = 26
list = (mx.collections::ArrayList)#10
length = 26
source = (Array)#11
[0] (com.riotgames.platform.gameclient.domain::RawStatDTO)#12
displayName = (null)
statTypeName = "PHYSICAL_DAMAGE_TAKEN"
value = 11156
[1] (com.riotgames.platform.gameclient.domain::RawStatDTO)#13
displayName = (null)
statTypeName = "TOTAL_DAMAGE_TAKEN"
value = 20653
[2] (com.riotgames.platform.gameclient.domain::RawStatDTO)#14
displayName = (null)
statTypeName = "ITEM2"
value = 3158
[3] (com.riotgames.platform.gameclient.domain::RawStatDTO)#15
displayName = (null)
statTypeName = "ITEM4"
value = 3089
[4] (com.riotgames.platform.gameclient.domain::RawStatDTO)#16
displayName = (null)
statTypeName = "WIN"
value = 1
[5] (com.riotgames.platform.gameclient.domain::RawStatDTO)#17
displayName = (null)
statTypeName = "PHYSICAL_DAMAGE_DEALT_PLAYER"
value = 18413
[6] (com.riotgames.platform.gameclient.domain::RawStatDTO)#18
displayName = (null)
statTypeName = "TOTAL_HEAL"
value = 16877
[7] (com.riotgames.platform.gameclient.domain::RawStatDTO)#19
displayName = (null)
statTypeName = "ITEM0"
value = 3083
[8] (com.riotgames.platform.gameclient.domain::RawStatDTO)#20
displayName = (null)
statTypeName = "LARGEST_CRITICAL_STRIKE"
value = 173
[9] (com.riotgames.platform.gameclient.domain::RawStatDTO)#21
displayName = (null)
statTypeName = "ITEM3"
value = 3116
[10] (com.riotgames.platform.gameclient.domain::RawStatDTO)#22
displayName = (null)
statTypeName = "ITEM1"
value = 0
[11] (com.riotgames.platform.gameclient.domain::RawStatDTO)#23
displayName = (null)
statTypeName = "GOLD_EARNED"
value = 11123
[12] (com.riotgames.platform.gameclient.domain::RawStatDTO)#24
displayName = (null)
statTypeName = "ASSISTS"
value = 8
[13] (com.riotgames.platform.gameclient.domain::RawStatDTO)#25
displayName = (null)
statTypeName = "LARGEST_MULTI_KILL"
value = 2
[14] (com.riotgames.platform.gameclient.domain::RawStatDTO)#26
displayName = (null)
statTypeName = "MAGIC_DAMAGE_DEALT_PLAYER"
value = 103124
[15] (com.riotgames.platform.gameclient.domain::RawStatDTO)#27
displayName = (null)
statTypeName = "BARRACKS_KILLED"
value = 0
[16] (com.riotgames.platform.gameclient.domain::RawStatDTO)#28
displayName = (null)
statTypeName = "LARGEST_KILLING_SPREE"
value = 5
[17] (com.riotgames.platform.gameclient.domain::RawStatDTO)#29
displayName = (null)
statTypeName = "ITEM5"
value = 0
[18] (com.riotgames.platform.gameclient.domain::RawStatDTO)#30
displayName = (null)
statTypeName = "MINIONS_KILLED"
value = 176
[19] (com.riotgames.platform.gameclient.domain::RawStatDTO)#31
displayName = (null)
statTypeName = "CHAMPIONS_KILLED"
value = 11
[20] (com.riotgames.platform.gameclient.domain::RawStatDTO)#32
displayName = (null)
statTypeName = "MAGIC_DAMAGE_TAKEN"
value = 8581
[21] (com.riotgames.platform.gameclient.domain::RawStatDTO)#33
displayName = (null)
statTypeName = "NEUTRAL_MINIONS_KILLED"
value = 12
[22] (com.riotgames.platform.gameclient.domain::RawStatDTO)#34
displayName = (null)
statTypeName = "TOTAL_TIME_SPENT_DEAD"
value = 189
[23] (com.riotgames.platform.gameclient.domain::RawStatDTO)#35
displayName = (null)
statTypeName = "TURRETS_KILLED"
value = 0
[24] (com.riotgames.platform.gameclient.domain::RawStatDTO)#36
displayName = (null)
statTypeName = "NUM_DEATHS"
value = 5
[25] (com.riotgames.platform.gameclient.domain::RawStatDTO)#37
displayName = (null)
statTypeName = "TOTAL_DAMAGE_DEALT"
value = 121538
uid = "B6F2F234-2E37-D979-E896-AA7614FCE9CB"
sort = (null)
source = (Array)#11
summonerName = "WeFearTheSun"
teamId = 100
userId = 21672484
wins = 6
Now there are ten players in a game, meaning ten of that snippet above, each one with an increasing counter, [1],[2],[3], and so on.
As you can see, inside this data there are further numbers, i.e. another [0].
If the numbers were unique I could just parse the [0], then the [1], then the [2], etc.
Right now I’m parsing the files like so:
/// <summary>
/// Cuts ten player sections out of <paramref name="content"/> using hard-coded
/// character counts and hands each section to ParsePlayer.
/// </summary>
/// <param name="content">Text to search; the markers searched for below must be present.</param>
/// <returns>The ten parsed players, in the order they were cut out.</returns>
/// <remarks>
/// NOTE(review): every IndexOf result is used without checking for -1, and every
/// Substring length is a fixed number, so shorter or differently laid out input
/// will presumably throw ArgumentOutOfRangeException - confirm against real logs.
/// </remarks>
private IEnumerable<Player> GetGamePlayers(string content)
{
List<Player> Players = new List<Player>();
// Start at the EndOfGameStats marker and keep a fixed 65000-character window.
var location = content.IndexOf("com.riotgames.platform.gameclient.domain::EndOfGameStats");
var cutContent = content.Substring(location, 65000);
// First group of five: find "[n]", take 6255 characters as the player's section,
// then drop 6229 characters past the match so the next search starts further on.
// NOTE(review): 6255 and 6229 look like lengths measured from one sample - confirm.
var playerOneLocation = cutContent.IndexOf("[0]");
//var playerOneContent = cutContent.Substring(playerOneLocation, 6231);
var playerOneContent = cutContent.Substring(playerOneLocation, 6255);
Players.Add(ParsePlayer(playerOneContent));
cutContent = cutContent.Substring(playerOneLocation + 6229, cutContent.Length - playerOneLocation - 6229);
var playerTwoLocation = cutContent.IndexOf("[1]");
var playerTwoContent = cutContent.Substring(playerTwoLocation, 6255);
Players.Add(ParsePlayer(playerTwoContent));
cutContent = cutContent.Substring(playerTwoLocation + 6229, cutContent.Length - playerTwoLocation - 6229);
var playerThreeLocation = cutContent.IndexOf("[2]");
var playerThreeContent = cutContent.Substring(playerThreeLocation, 6255);
Players.Add(ParsePlayer(playerThreeContent));
cutContent = cutContent.Substring(playerThreeLocation + 6229, cutContent.Length - playerThreeLocation - 6229);
var playerFourLocation = cutContent.IndexOf("[3]");
var playerFourContent = cutContent.Substring(playerFourLocation, 6255);
Players.Add(ParsePlayer(playerFourContent));
cutContent = cutContent.Substring(playerFourLocation + 6229, cutContent.Length - playerFourLocation - 6229);
var playerFiveLocation = cutContent.IndexOf("[4]");
var playerFiveContent = cutContent.Substring(playerFiveLocation, 6255);
Players.Add(ParsePlayer(playerFiveContent));
// cutContent is not advanced past player five here; the next marker is searched
// for in the text that still starts after player four's cut.
// Second group of five: restart from the teamPlayerParticipantStats marker with a
// fixed 32000-character window and repeat the same "[0]".."[4]" procedure.
location = cutContent.IndexOf("teamPlayerParticipantStats");
cutContent = cutContent.Substring(location, 32000);
var playerSixLocation = cutContent.IndexOf("[0]");
var playerSixContent = cutContent.Substring(playerSixLocation, 6255);
Players.Add(ParsePlayer(playerSixContent));
cutContent = cutContent.Substring(playerSixLocation + 6229, cutContent.Length - playerSixLocation - 6229);
var playerSevenLocation = cutContent.IndexOf("[1]");
var playerSevenContent = cutContent.Substring(playerSevenLocation, 6255);
Players.Add(ParsePlayer(playerSevenContent));
cutContent = cutContent.Substring(playerSevenLocation + 6229, cutContent.Length - playerSevenLocation - 6229);
var playerEightLocation = cutContent.IndexOf("[2]");
var playerEightContent = cutContent.Substring(playerEightLocation, 6255);
Players.Add(ParsePlayer(playerEightContent));
cutContent = cutContent.Substring(playerEightLocation + 6229, cutContent.Length - playerEightLocation - 6229);
var playerNineLocation = cutContent.IndexOf("[3]");
var playerNineContent = cutContent.Substring(playerNineLocation, 6255);
Players.Add(ParsePlayer(playerNineContent));
cutContent = cutContent.Substring(playerNineLocation + 6229, cutContent.Length - playerNineLocation - 6229);
var playerTenLocation = cutContent.IndexOf("[4]");
var playerTenContent = cutContent.Substring(playerTenLocation, 6255);
Players.Add(ParsePlayer(playerTenContent));
return Players;
}
I can tell the code is horrible, and there is a lot of room for improvement. How would I divide the huge content into single blocks for each individual player?
I need to get the string content:
[0] (com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)#8
to
[1] (com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)#8
then:
[1] (com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)#8
to
[2] (com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)#8
Solution
Some rules of thumb that should improve your code:
- If you have variables with names resembling varOne, varTwo, varThree, etc. – no matter what the number – you should probably be using an array.
- If you are doing several identical actions over a sequence of values you should be using a loop.
- Code should never need to be repeated.
Now with these in mind let’s get to work.
/// <summary>
/// Splits the end-of-game section of <paramref name="content"/> into one chunk
/// per player and parses each chunk with ParsePlayer.
/// </summary>
/// <param name="content">Full log text to search for the EndOfGameStats marker.</param>
/// <returns>
/// One Player per PlayerParticipantStatsSummary entry found after the marker,
/// in file order; an empty list when the marker is not present.
/// </returns>
private IEnumerable<Player> GetGamePlayers(string content)
{
    List<Player> players = new List<Player>();

    // Get the endgame stats text. Ordinal comparison: the marker is a fixed
    // machine-written token, not linguistic text.
    string endgameStatsMarker = "com.riotgames.platform.gameclient.domain::EndOfGameStats";
    int endgameStatsLocation = content.IndexOf(endgameStatsMarker, StringComparison.Ordinal);
    if (endgameStatsLocation < 0)
    {
        // No end-of-game section in this text, so there are no players to parse.
        return players;
    }
    string endgameStats = content.Substring(endgameStatsLocation);

    // Split the endgame stats text into an array of player information sections.
    string[] playerBeginMarker = new string[] { "(com.riotgames.platform.gameclient.domain::PlayerParticipantStatsSummary)" };
    string[] playerInfoSet = endgameStats.Split(playerBeginMarker, StringSplitOptions.None);

    // Don't assume there are 10 players: the number of sections produced by the
    // split is the number of players in this file.
    // playerInfoSet[0] is the text before any player info begins, so we skip it.
    for (int i = 1; i < playerInfoSet.Length; i++)
    {
        // The string passed to ParsePlayer will be a little different now.
        // It will contain a [#] on the end except in the case of the last player.
        // This is since we split the text at the beginning of the player field
        // and AFTER the [#].
        // You will have to ensure your ParsePlayer method will work with this
        // new input.
        players.Add(ParsePlayer(playerInfoSet[i]));
    }

    return players;
}
… Well, that’s most of your code removed by following some simple rules. The story of Bill Atkinson is probably quite relevant here. 😉
On better parsing
There is one large problem in your approach to file parsing I must talk to you about.
You are guessing how long the player data section is going to be based on the current logfile format’s appearance and hardcoding this into the program. Let’s examine this method’s faults. Tell me what happens when any of these things happen:
- Player data sections become one character longer.
- Player data sections get one more line of information.
- Player data sections have another array introduced to them.
The answers (no peeking): 1. Your parsing of player data might go funny. 2. Your parsing will probably explode. 3. Your parsing will definitely explode in a horrible mess.
If you were parsing the file correctly, your program would be at least an order of magnitude less likely to stuff up in each case, until a major change is made to the logfile format.
The key to parsing this file is recognising that this file is full of human-readable and computer-readable markup you are not exploiting at all.
Here is what you are doing: Open file. Identify where sections split. Count the number of characters. Hardcode this into the program and extract those characters.
Here is what you should be doing: Open file. Identify where sections split. Determine how you identified where the sections split. Write code that is able to recognise this split, and splits the file content accordingly.
Thus, the key is: understanding how you understand what you see, then teaching your program to understand it as well!
For instance, consider these rules:
- A character information section begins at a particular string and ends at the next occurrence of the same string, so you can just split the character information sections on that string.
- Once you reach a # you’ve found a special field. It may be a simple comment or it may be an ID for each unique object, allowing reusing objects from cache in case two objects are identical. It is probably safe to ignore from a # to the end of a line unless you find a use for it.
- A variable can be found at the beginning of a line after any whitespace. The amount of whitespace is significant (see last point).
- If the first character found on a line is [ it’s an element of an array, not a variable name.
- A variable name ends at the first space.
- A variable’s value is on the other side of the =, removing any unnecessary spaces on the other side.
- If a value starts with an integer it is an integer. If it starts with quotation marks it is a string.
- Variables will always contain an element ID then a type declaration. The variables that follow are probably the properties of an object of that type.
- An array item’s contents are always indented. An increase in indent depth means you’re now reading the contents of the array from the line above. A decrease in indent depth means you’ve finished reading the contents of that array.
Nine rules just from understanding and describing how I understand what I see. They should be fairly simple to implement in the parser in the form of methods – then all of a sudden your parser can already deal with 90% of the logfile’s syntax! Only a few things are missing at this point, such as parsing an array element’s type declaration or inferring the type of a variable’s value (string? integer?).