How do I use the HTML Agility Pack?

4

How do I use the HTML Agility Pack in my C# project in Visual Studio? I have a table extracted through a WebBrowser object, but when I run a few splits on it I end up with an array of almost 700 entries. I would like an easier way to find the elements I want.

 /// <summary>
 /// Timer tick handler: creates a WebBrowser, hooks its Navigated event and
 /// starts loading the Loteca results page.
 /// </summary>
 /// <param name="sender">Event source; not read here.</param>
 /// <param name="e">Event data; not read here.</param>
 private void timer_loteca_Tick(object sender, EventArgs e)
    {
        //loteca
        WebBrowser clienteloteca = new WebBrowser();

        // Attach the handler before starting the navigation so the event
        // cannot be raised while nothing is listening yet.
        clienteloteca.Navigated += clienteloteca_Navigated;

        clienteloteca.Navigate("http://www1.caixa.gov.br/loterias/loterias/loteca/loteca_pesquisa_new.asp");

        // NOTE(review): this disables timer_federal, not timer_loteca, so this
        // handler keeps firing and creates a new WebBrowser on every tick.
        // Confirm whether timer_loteca was intended.
        timer_federal.Enabled = false;
    }

    /// <summary>
    /// Navigated handler for the Loteca WebBrowser: reads the loaded page body
    /// and slices it with '|' and line-break splits to pull out single values.
    /// </summary>
    /// <param name="sender">The WebBrowser that raised the event (cast below).</param>
    /// <param name="e">Navigation event data; not read in the visible code.</param>
    void clienteloteca_Navigated(object sender, WebBrowserNavigatedEventArgs e)
    {

        try
        {

            // The second ';' is an empty statement.
            var s = (WebBrowser)sender; ;
            // Not used anywhere in the visible part of this method.
            string acumulou = string.Empty;

            // Raw HTML markup of the page body.
            var tabela = s.Document.Body.InnerHtml;

            // Pieces of the markup between '|' characters.
            string[] lala = tabela.Split('|');

            // Fourth piece, cut at every "<table" (Regex.Split treats the text as a pattern).
            string[] line22 = Regex.Split(lala[3], "<table" );

            // Everything before the first "<table", with CRLF line breaks removed.
            string line24 = line22[0].Replace("\r\n","");
            // Cut at each "</TD>"; the match is case-sensitive, so "</td>" is not a separator.
            string[] line23 = Regex.Split(line24, "</TD>");

            // Plain-text rendering of the same body.
            var megasena = s.Document.Body.InnerText;


            string megasena1 = megasena;

            // Pieces of the plain text between '|' characters.
            string[] lines = megasena1.Split('|');

            // Fourth piece, one array entry per CRLF-terminated line.
            string[] line20 = Regex.Split(lines[3], "\r\n");

            // Fifth piece, split the same way; not used in the visible code.
            string[] line30 = Regex.Split(lines[4], "\r\n");

            // From every fourth entry of line20 (indexes 1, 5, 9, ... 33) take the
            // first character and the remainder as two separate values.
            // NOTE(review): all indexes are fixed, so a page with fewer pieces or
            // lines throws IndexOutOfRangeException, and an empty entry makes
            // Substring(0, 1) throw ArgumentOutOfRangeException.
            string res1 = line20[1].ToString().Substring(0,1);
            string res2 = line20[1].ToString().Substring(1);
            string res3 = line20[5].ToString().Substring(0, 1);
            string res4 = line20[5].ToString().Substring(1);
            string res5 = line20[9].ToString().Substring(0, 1);
            string res6 = line20[9].ToString().Substring(1);
            string res7 = line20[13].ToString().Substring(0, 1);
            string res8 = line20[13].ToString().Substring(1);
            string res9 = line20[17].ToString().Substring(0, 1);
            string res10 = line20[17].ToString().Substring(1);
            string res11 = line20[21].ToString().Substring(0, 1);
            string res12 = line20[21].ToString().Substring(1);
            string res13 = line20[25].ToString().Substring(0, 1);
            string res14 = line20[25].ToString().Substring(1);
            string res15 = line20[29].ToString().Substring(0, 1);
            string res16 = line20[29].ToString().Substring(1);
            string res17 = line20[33].ToString().Substring(0, 1);
            string res18 = line20[33].ToString().Substring(1);
// NOTE(review): the snippet appears to be cut off here - the try block has no
// catch/finally and the closing brace of the method is not shown.
}
    
asked by anonymous 24.02.2014 / 21:26

2 answers

1

I do not know what website you are trying to parse, but I see some errors in your code design:

  • First, you do not need to use the WebBrowser class. This class represents a graphical control to be used in a window. Instead, use the WebClient class to download the page.

  • Second, you do not need to use regex or split to parse the page ... that's what HtmlAgilityPack is for.

I've created an example of what your code looks like using this library:

//loteca
// Download the results page as text. WebClient is disposable, so it is
// wrapped in a using block.
string html;
using (var client = new WebClient())
{
    client.Headers[HttpRequestHeader.UserAgent] =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) "
        + "Chrome/15.0.874.121 Safari/535.2";

    // No "Accept-Encoding: gzip" header: WebClient does not decompress the
    // response, so asking for gzip can make DownloadString return the
    // compressed bytes decoded as text.
    html = client.DownloadString(
        "http://www1.caixa.gov.br/loterias/loterias/loteca/loteca_pesquisa_new.asp");
}

var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(html);

// Get a list with every table in the page. The XPath must be "//table":
// a bare "table" only matches <table> elements that are direct children of
// the document node. SelectNodes returns null (not an empty collection) when
// nothing matches, so check for null before iterating.
var todasAsTabelas = htmlDoc.DocumentNode.SelectNodes("//table");

I cannot reach this site from my workplace, so I was not able to test the code above. If needed, I can put together a working example against a site other than www1.caixa.gov.br.

    
25.02.2014 / 16:44
0

I was able to get the values from the table this way:

/// <summary>
/// Timer tick handler: while the timer is still enabled, starts loading the
/// Loteca results page in a new WebBrowser and then disables the timer.
/// </summary>
/// <param name="sender">Event source; not read here.</param>
/// <param name="e">Event data; not read here.</param>
private void timer_loteca_Tick(object sender, EventArgs e)
    {
        //loteca

        // Nothing to do once the timer has been switched off; in particular,
        // do not create a browser that will never navigate.
        if (!timer_loteca.Enabled)
        {
            return;
        }

        WebBrowser clienteloteca = new WebBrowser();

        // Action to be taken on page loading completion. Attached before
        // Navigate so the handler is in place when the load finishes.
        clienteloteca.DocumentCompleted += clienteloteca_DocumentCompleted;

        clienteloteca.Navigate("http://www1.caixa.gov.br/loterias/loterias/loteca/loteca_pesquisa_new.asp");
        timer_loteca.Enabled = false;
    }



/// <summary>
/// Holds the text of one row of the Loteca games table; every value is kept
/// as the string read from the page.
/// </summary>
public class JogosLoteca
{
    /// <summary>Text of the row's first cell (the game entry).</summary>
    public string Jogo { get; set; }

    /// <summary>Text stored for the first team.</summary>
    public string Time1 { get; set; }

    /// <summary>Text stored for the first team's result.</summary>
    public string Resultado1 { get; set; }

    /// <summary>Text stored for the second team.</summary>
    public string Time2 { get; set; }

    /// <summary>Text stored for the second team's result.</summary>
    public string Resultado2 { get; set; }

    /// <summary>Text of the row's last cell (the date entry).</summary>
    public string Data { get; set; }
}

    /// <summary>
    /// Holds three string values for one winners entry.
    /// NOTE(review): nothing in the visible code fills or reads this type;
    /// the meanings below are presumed from the property names.
    /// </summary>
    public class ganhadoresloteca
    {
        /// <summary>Presumably the prize tier label.</summary>
        public string faixa { get; set; }

        /// <summary>Presumably the number of winners, kept as text.</summary>
        public string num_ganhadores { get; set; }

        /// <summary>Presumably the prize amount, kept as text.</summary>
        public string val_premio { get; set; }
    }


/// <summary>
/// DocumentCompleted handler: finds the element with id "tabela_jogo_loteca"
/// in the loaded page and turns each of its rows into a JogosLoteca entry.
/// </summary>
/// <param name="sender">The WebBrowser that raised the event.</param>
/// <param name="e">Event data; not read here.</param>
private void clienteloteca_DocumentCompleted(object sender, System.Windows.Forms.WebBrowserDocumentCompletedEventArgs e)
    {
        var s = (WebBrowser)sender;

        // Leave quietly when the page has no document, no such table, or the
        // table lacks the second child that holds the rows.
        if (s.Document == null)
        {
            return;
        }

        var TabelaDeJogosLoteca = s.Document.GetElementById("tabela_jogo_loteca");
        if (TabelaDeJogosLoteca == null || TabelaDeJogosLoteca.Children.Count < 2)
        {
            return;
        }

        // Children[1] is the row container - presumably the <tbody>; confirm
        // against the page markup.
        HtmlElementCollection linhas = TabelaDeJogosLoteca.Children[1].Children;
        List<JogosLoteca> jogos = new List<JogosLoteca>(linhas.Count);

        foreach (HtmlElement trElement in linhas)
        {
            HtmlElementCollection celulas = trElement.Children;

            // Rows with fewer than seven cells do not carry a full game
            // entry; skip them instead of failing on a missing index.
            if (celulas.Count < 7)
            {
                continue;
            }

            // The cell at index 3 sits between the two teams and is not stored.
            jogos.Add(new JogosLoteca
            {
                Jogo = celulas[0].InnerText,
                Resultado1 = celulas[1].InnerText,
                Time1 = celulas[2].InnerText,
                Time2 = celulas[4].InnerText,
                Resultado2 = celulas[5].InnerText,
                Data = celulas[6].InnerText
            });
        }

        // NOTE(review): the visible code builds 'jogos' but never uses it;
        // the consumer of this list is presumably outside the snippet.
    }

However, this solution does not use the HTML Agility Pack.

    
25.02.2014 / 19:40