How to extract information from an HTML document with Jsoup?

1

I've been studying how to extract data with Jsoup, and I took the example below from a Jsoup tutorial.

I tried to adapt the example to extract data from a div instead of a meta attribute, but I could not get it to work.

I want to fetch the posts that people publish on a certain blog and display them on a page of the app.

Can anyone help me modify this code to get the data from the div?

package com.androidbegin.jsouptutorial;

import java.io.IOException;
import java.io.InputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import android.os.AsyncTask;
import android.os.Bundle;
import android.app.Activity;
import android.app.ProgressDialog;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.view.View;
import android.view.View.OnClickListener;
import android.widget.Button;
import android.widget.ImageView;
import android.widget.TextView;

/**
 * Demo activity that scrapes a web page with Jsoup.
 *
 * <p>Three buttons each start an {@link AsyncTask} that downloads {@link #url}
 * and shows one piece of it: the page title, the meta description, or the
 * site logo. A progress dialog is displayed while a task is running.
 */
public class MainActivity extends Activity {

    // URL Address
    String url = "http://www.androidbegin.com";
    ProgressDialog mProgressDialog;

    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Locate the Buttons in activity_main.xml
        Button titlebutton = (Button) findViewById(R.id.titlebutton);
        Button descbutton = (Button) findViewById(R.id.descbutton);
        Button logobutton = (Button) findViewById(R.id.logobutton);

        // Capture button click
        titlebutton.setOnClickListener(new OnClickListener() {
            @Override
            public void onClick(View arg0) {
                // Execute Title AsyncTask
                new Title().execute();
            }
        });

        // Capture button click
        descbutton.setOnClickListener(new OnClickListener() {
            @Override
            public void onClick(View arg0) {
                // Execute Description AsyncTask
                new Description().execute();
            }
        });

        // Capture button click
        logobutton.setOnClickListener(new OnClickListener() {
            @Override
            public void onClick(View arg0) {
                // Execute Logo AsyncTask
                new Logo().execute();
            }
        });

    }

    @Override
    protected void onDestroy() {
        // Dismiss any dialog still on screen so its window is not leaked
        // when the activity goes away while a task is still running.
        hideProgress();
        super.onDestroy();
    }

    /**
     * Shows the "Loading..." dialog shared by all tasks.
     *
     * <p>Any dialog that is still showing is dismissed first, so starting a
     * second task never orphans the dialog of the first one.
     */
    private void showProgress() {
        hideProgress();
        mProgressDialog = new ProgressDialog(MainActivity.this);
        mProgressDialog.setTitle("Android Basic JSoup Tutorial");
        mProgressDialog.setMessage("Loading...");
        mProgressDialog.setIndeterminate(false);
        mProgressDialog.show();
    }

    /** Dismisses the shared progress dialog if there is one. */
    private void hideProgress() {
        if (mProgressDialog != null) {
            if (mProgressDialog.isShowing()) {
                mProgressDialog.dismiss();
            }
            mProgressDialog = null;
        }
    }

    /** Downloads the page and shows its {@code <title>} in the title TextView. */
    private class Title extends AsyncTask<Void, Void, Void> {
        // Stays null when the download fails.
        String title;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgress();
        }

        @Override
        protected Void doInBackground(Void... params) {
            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Get the html document title
                title = document.title();
            } catch (IOException e) {
                // Best effort: the TextView is simply left empty on failure.
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set title into TextView
            TextView txttitle = (TextView) findViewById(R.id.titletxt);
            txttitle.setText(title);
            hideProgress();
        }
    }

    /** Downloads the page and shows its meta description in the description TextView. */
    private class Description extends AsyncTask<Void, Void, Void> {
        // Stays null when the download fails.
        String desc;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgress();
        }

        @Override
        protected Void doInBackground(Void... params) {
            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Using Elements to get the Meta data
                Elements description = document
                        .select("meta[name=description]");
                // Locate the content attribute
                desc = description.attr("content");
            } catch (IOException e) {
                // Best effort: the TextView is simply left empty on failure.
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set description into TextView
            TextView txtdesc = (TextView) findViewById(R.id.desctxt);
            txtdesc.setText(desc);
            hideProgress();
        }
    }

    /** Downloads the page, locates the brand logo image and shows it in the ImageView. */
    private class Logo extends AsyncTask<Void, Void, Void> {
        // Stays null when the page has no logo or the download fails.
        Bitmap bitmap;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgress();
        }

        @Override
        protected Void doInBackground(Void... params) {

            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Using Elements to get the class data
                Elements img = document.select("a[class=brand brand-image] img[src]");
                // Locate the src attribute; the "abs:" prefix resolves a
                // relative path against the page URL so java.net.URL accepts it.
                String imgSrc = img.attr("abs:src");
                if (imgSrc.length() == 0) {
                    // No logo element matched (or its URL could not be resolved).
                    return null;
                }
                // Download image from URL
                InputStream input = new java.net.URL(imgSrc).openStream();
                try {
                    // Decode Bitmap
                    bitmap = BitmapFactory.decodeStream(input);
                } finally {
                    // Always release the connection, even if decoding fails.
                    input.close();
                }

            } catch (IOException e) {
                // Best effort: the ImageView is simply cleared on failure.
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set downloaded image into ImageView
            ImageView logoimg = (ImageView) findViewById(R.id.logo);
            logoimg.setImageBitmap(bitmap);
            hideProgress();
        }
    }
}

The structure of the site where I want to extract the data looks like this:

<div class="postWrapper" id="post162">

   <div class="postTitle">
      <h2> Titulo do post </h2>

      <div class="fb-custom-share" data-url="http://url..."></div>

      <div class="date"> 26 de janeiro de 2015 </div>
   </div>
   <div class="postContent">
      Conteudo
   </div>
</div>
    
asked by anonymous 02.02.2015 / 18:01

1 answer

1

Just get all the elements with the class postWrapper, which is where the content you are interested in lives; the getElementsByClass method does exactly that. Then iterate over the elements found (the result is an Elements collection) and access their children. Since all of the children have classes, you can use the same method to get each of them.

The only difference is the share link, which is stored in the data-url attribute (a data-* attribute) of the fb-custom-share element. For that one you need to get the element by its class and then read the attribute:

// Look up the descendant(s) with class "fb-custom-share" and read their data-url attribute.
element.getElementsByClass("fb-custom-share").attr("data-url");


// Download and parse the page.
Document document = Jsoup.connect(URL).get();

// Every post is wrapped in an element with class "postWrapper".
Elements elements = document.getElementsByClass("postWrapper");

for(Element element : elements){
   // Text of the <h2> that is a direct child of the .postTitle element.
   String title = element.select(".postTitle > h2").text();
   // The share link lives in an attribute, not in the element's text.
   String share = element.getElementsByClass("fb-custom-share").attr("data-url");
   String date = element.getElementsByClass("date").text();
   String content = element.getElementsByClass("postContent").text();

   // save into an ArrayList<ArrayList> or a Map...
}

A test code follows:

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Stand-alone test: parses the HTML structure from the question and prints
 * the title, share link, date and content of every post it contains.
 */
public class Main {
    public static void main(String[] args) throws IOException {

        // HTML structure posted in the question.
        StringBuilder html = new StringBuilder();
        html.append("<div class=\"postWrapper\" id=\"post162\">")
                .append("<div class=\"postTitle\">")
                    .append("<h2>Título A</h2>")
                    .append("<div class=\"fb-custom-share\" data-url=\"linkA\"></div>")
                    .append("<div class=\"date\"> 26 de janeiro de 2015 </div>")
                .append("</div>")
                .append("<div class=\"postContent\">")
                    .append("Conteúdo A")
                .append("</div>")
            .append("</div>");

        // Parse the String and try to turn it into a document.
        Document document = Jsoup.parse(html.toString());

        Elements elements = document.getElementsByClass("postWrapper");

        for (Element element : elements) {
            String title = element.select(".postTitle > h2").text();
            String share = element.getElementsByClass("fb-custom-share").attr("data-url");
            String date = element.getElementsByClass("date").text();
            String content = element.getElementsByClass("postContent").text();

            // One field per line; without the separators every label is glued
            // to the previous value and the output is unreadable.
            System.out.println("Título: " + title + "\n" +
                               "Link de compartilhamento: " + share + "\n" +
                               "Data: " + date + "\n" +
                               "Conteúdo: " + content);
        }
    }
}

output:

  

Title: Title A Share link: linkA
Date: 26 de janeiro de 2015
Content: Content A

    
02.02.2015 / 21:57