You are here: Home » .NET » Parsing HTML table to C# usable datalist

Parsing HTML table to C# usable datalist

Problem:

  • There’s no ready-made .NET class that parses HTML table syntax and does what is described in the objective below, so it has to be custom made

Objective:

  • To be able to pass any HTML in form of string to a method, and have it return a list of rows in form of list of KeyValuePair

Technology Components:

  1.   C#
  2.   HTMLAgilityPack

Algorithm:
1) Table data will be stored in form of list of row List<List<KeyValuePair<string, string>>>

  • Where each row is in form of list of cell List<KeyValuePair<string, string>>
  • Where each cell is in form of KeyValuePair<string, string>
  • Key is the tag id
  • Value is the tag innerText

2) Loop through each TR tag

  • Then loop through each TD or TH tag inside that row
  •   Capture each tag id and innerText
  •   Store the data in form of KeyValuePair<string, string>

NOTE: Some of you might think why not use dictionary instead?

  • Well yes, I’ve tried that idea. However, I need to get to an item and replace it by index instead of by key, so a list of KeyValuePair works better here than a Dictionary

Code:


 


/// <summary>
/// Parses every &lt;tr&gt; element found in <paramref name="htmlString"/> and returns the
/// table data as a list of rows, where each row is a list of cells and each cell is a
/// <see cref="KeyValuePair{TKey, TValue}"/> whose Key is the cell's id and whose Value is
/// the cell's inner text (with "&amp;nbsp;" replaced by a space, then trimmed).
/// </summary>
/// <remarks>
/// A cell with a numeric colspan is added once per spanned column. A cell with
/// rowspan greater than 1 seeds "pending" rows that are consumed by the following
/// &lt;tr&gt; elements; in those pending rows, cells with an empty Key act as placeholders
/// that the later row's own cells fill in from left to right.
/// Rows whose cells are all empty, and rows that consist of a single empty cell, are
/// not included in the result.
/// </remarks>
/// <param name="htmlString">The HTML markup to parse; may be null or empty.</param>
/// <returns>
/// The parsed rows. An empty list is returned when <paramref name="htmlString"/> is null
/// or empty, or when it contains no &lt;tr&gt; elements.
/// </returns>
public static List<List<KeyValuePair<string, string>>> ParseHtmlToDataTable(string htmlString)
        {
            List<List<KeyValuePair<string, string>>> theList = new List<List<KeyValuePair<string, string>>>();
            if (!string.IsNullOrEmpty(htmlString))
            {
                bool IsNullEntireRow = true;
                string colSpanString;
                string ColInnerText;
                int colSpan = 0;
                string colSpanTag = "TH,TD";
                int rowSpanCount = 0;
                // Rows pre-built by earlier rowspan cells, waiting to be filled by the next <tr> elements
                List<List<KeyValuePair<string, string>>> previousRowSpanList = new List<List<KeyValuePair<string, string>>>();

                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(htmlString);
                List<KeyValuePair<string, string>> theDict;
                List<KeyValuePair<string, string>> theRowSpanDict;
                var tRowList = doc.DocumentNode.SelectNodes("//tr");
                HtmlNode cell;
                int tCelCount = 0;
                int theDictCellIndex;

                bool isWorkingWithRowSpan = false;
                if (tRowList != null && tRowList.Count > 0)
                {
                    foreach (HtmlNode tRow in tRowList)
                    {
                        // Start from a pending rowspan row when one exists, otherwise from a fresh row
                        if (previousRowSpanList.Count > 0)
                        {
                            theDict = previousRowSpanList[0];
                            previousRowSpanList.Remove(theDict);        //remove it off the list
                            isWorkingWithRowSpan = true;
                        }
                        else
                        {
                            theDict = new List<KeyValuePair<string, string>>();
                            isWorkingWithRowSpan = false;
                        }
                        var tCellList = tRow.SelectNodes("td|th");
                        // SelectNodes returns null (not an empty collection) when the row has no
                        // td/th children, so guard before reading Count.
                        tCelCount = tCellList == null ? 0 : tCellList.Count;
                        // Skip rows with no cells and rows made of one single empty cell
                        if (tCelCount > 0 &&
                            !(tCelCount == 1 && string.IsNullOrEmpty(tCellList[0].InnerText.Trim()))
                            )
                        {
                            IsNullEntireRow = true;
                            for (int colIndex = 0; colIndex < tCelCount; colIndex++)
                            {
                                cell = tCellList[colIndex];
                                ColInnerText = cell.InnerText.Replace("&nbsp;", " ").Trim();
                                if (!string.IsNullOrEmpty(ColInnerText))
                                    IsNullEntireRow = false;

                                //Handle colspan
                                colSpanString = cell.Attributes["Colspan"] == null ? "" : cell.Attributes["Colspan"].Value;
                                // int.TryParse resets colSpan to 0 when the attribute is missing or not numeric
                                if (colSpanTag.Contains(cell.Name.ToUpper()) && int.TryParse(colSpanString, out colSpan))
                                {
                                    //Colspan column handler: repeat the cell once per spanned column
                                    for (int i = 0; i < colSpan; i++)
                                    {
                                        if (isWorkingWithRowSpan)
                                        {
                                            // Fill the first placeholder (empty Key) of the pending row with this cell.
                                            // NOTE(review): a copied rowspan cell that has no id also has an empty Key,
                                            // so it looks like a placeholder here — confirm this is intended.
                                            theDictCellIndex = theDict.IndexOf(theDict.Where(item => string.IsNullOrEmpty(item.Key)).FirstOrDefault());
                                            if (theDictCellIndex >= 0)
                                            {
                                                theDict.RemoveAt(theDictCellIndex);     //Remove current empty column
                                                theDict.Insert(theDictCellIndex, new KeyValuePair<string, string>(cell.Id.ToString(), ColInnerText)); // Add new column
                                            }
                                        }
                                        else
                                            theDict.Add(new KeyValuePair<string, string>(cell.Id.ToString(), ColInnerText));

                                        //Add empty column for rowspan row so pending rows keep up with the current row's width
                                        if (previousRowSpanList.Count > 0 && previousRowSpanList[0].Count() < theDict.Count())
                                            previousRowSpanList.ForEach(item => item.Add(new KeyValuePair<string, string>("", "")));
                                    }
                                }
                                else
                                {
                                    //Not Colspan handler: the cell occupies exactly one column
                                    if (isWorkingWithRowSpan)
                                    {
                                        theDictCellIndex = theDict.IndexOf(theDict.Where(item => string.IsNullOrEmpty(item.Key)).FirstOrDefault());
                                        if (theDictCellIndex >= 0)
                                        {
                                            theDict.RemoveAt(theDictCellIndex);     //Remove current empty column
                                            theDict.Insert(theDictCellIndex, new KeyValuePair<string, string>(cell.Id.ToString(), ColInnerText)); // Add new column
                                        }
                                    }
                                    else
                                        theDict.Add(new KeyValuePair<string, string>(cell.Id.ToString(), ColInnerText));

                                    //Add empty column for rowspan row
                                    if (previousRowSpanList.Count > 0 && previousRowSpanList[0].Count() < theDict.Count())
                                        previousRowSpanList.ForEach(item => item.Add(new KeyValuePair<string, string>("", "")));
                                }

                                //Init RowSpan Tracking
                                if (cell.Attributes["RowSpan"] != null)
                                {
                                    rowSpanCount = 0;
                                    int.TryParse(cell.Attributes["RowSpan"].Value, out rowSpanCount);
                                    if (rowSpanCount > 1)
                                    {
                                        rowSpanCount--;     //Minus one because one is already added
                                        theRowSpanDict = new List<KeyValuePair<string, string>>();
                                        int theColSpanCount = 1;            //Colspan = 0 also means 1 for this situation
                                        if (colSpan > theColSpanCount)
                                            theColSpanCount = colSpan;
                                        // Number of leading columns that do not belong to this spanning cell
                                        int emptyColCount = theDict.Count - theColSpanCount;
                                        if (rowSpanCount > previousRowSpanList.Count())
                                        {
                                            //Add previousRowSpanList row if it doesn't exist
                                            for (int i = 0; i < rowSpanCount; i++)
                                            {
                                                theRowSpanDict = new List<KeyValuePair<string, string>>();
                                                if (theRowSpanDict.Count <= 0)
                                                {
                                                    // Placeholders for the columns before the spanning cell...
                                                    for (int thisCount = 0; thisCount < emptyColCount; thisCount++)
                                                    {
                                                        theRowSpanDict.Add(new KeyValuePair<string, string>("", ""));
                                                    }
                                                    // ...followed by copies of the spanning cell's column(s)
                                                    for (int thisCount = emptyColCount; thisCount < theDict.Count; thisCount++)
                                                    {
                                                        var theDictCol = new KeyValuePair<string, string>(theDict[thisCount].Key, theDict[thisCount].Value);
                                                        theRowSpanDict.Add(theDictCol);
                                                    }
                                                }
                                                previousRowSpanList.Add(theRowSpanDict);
                                            }
                                        }
                                        else
                                        {
                                            //update previousRowSpanList columns with rowSpan Column if it exists
                                            foreach (var rowSpanDict in previousRowSpanList)
                                            {
                                                int thisCount;
                                                if (emptyColCount < theDict.Count)
                                                {
                                                    //Rowspan and ColsSpan
                                                    for (thisCount = emptyColCount; thisCount < theDict.Count; thisCount++)
                                                    {
                                                        var theDictCol = theDict[thisCount];
                                                        rowSpanDict.RemoveAt(thisCount);
                                                        rowSpanDict.Insert(thisCount, theDictCol);
                                                    }
                                                }
                                                else
                                                {
                                                    //Rowspan only
                                                    thisCount = theDict.Count - 1;
                                                    var theDictCol = theDict[thisCount];
                                                    rowSpanDict.RemoveAt(thisCount);
                                                    rowSpanDict.Insert(thisCount, theDictCol);
                                                }
                                            }
                                        }
                                    }
                                }
                            }

                            //Add new item only if row is not empty
                            if (!(theDict.Count == 1 && theDict.FirstOrDefault().Value == "") && !IsNullEntireRow)
                                theList.Add(theDict);
                        }
                    }
                }
            }

            return theList;
        }

Code Sample:  Click here to download code sample

– The sample code includes an implementation of the MVVM pattern, AvalonDock, HTMLAgilityPack, a custom DynamicGrid (dynamic datagrid) and, of course, the parsing logic method above.

 

Instruction for trying out code sample

ParseHtml