Convert JSoup doc to string, apply Regex, and return a value in String

1

I'm using an example I found called AndroidJsoup to get the source HTML of a certain page, but I can't extract just the code snippet I want, which sits inside a certain <script> tag.

In short, AndroidJsoup should run, fetch the HTML, take the <script> content, apply the regex to it, and return the matched value.

Below is my source code, together with an example of the page from which the String resultado1 should be extracted, and the regex taken from my PHP script.


Android MainActivity.java

package com.survivingwithandroid.jsoup;
import android.os.AsyncTask;
import android.os.Bundle;
import android.support.v7.app.ActionBarActivity;
import android.util.Log;
import android.view.Menu;
import android.view.MenuItem;
import android.view.View;
import android.widget.Button;
import android.widget.EditText;

import org.jsoup.Jsoup;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Single-screen activity that downloads the page at a user-supplied URL with jsoup and
 * shows a plain-text summary of it (title, meta tags, {@code h2.topic} headings and the
 * raw content of every {@code <script>} element) in a text field.
 */
public class MainActivity extends ActionBarActivity {

    /** Single logcat tag for this screen, so all of its messages can be filtered together. */
    private static final String TAG = "JSwa";

    /** Field that receives the text produced by {@link ParseURL}. */
    private EditText respText;

    /**
     * Inflates the layout and wires the "go" button so that a click starts a
     * background download of the URL currently typed into the URL field.
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        final EditText edtUrl = (EditText) findViewById(R.id.edtURL);
        Button btnGo = (Button) findViewById(R.id.btnGo);
        respText = (EditText) findViewById(R.id.edtResp);
        btnGo.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                String siteUrl = edtUrl.getText().toString();
                // execute() is varargs, so the URL can be passed directly.
                new ParseURL().execute(siteUrl);
            }
        });
    }

    /** Inflates the options menu; this adds items to the action bar if it is present. */
    @Override
    public boolean onCreateOptionsMenu(Menu menu) {
        getMenuInflater().inflate(R.menu.main, menu);
        return true;
    }

    /**
     * Handles action bar item clicks. Home/Up clicks are handled automatically by the
     * action bar as long as a parent activity is specified in AndroidManifest.xml.
     */
    @Override
    public boolean onOptionsItemSelected(MenuItem item) {
        int id = item.getItemId();
        if (id == R.id.action_settings) {
            return true;
        }
        return super.onOptionsItemSelected(item);
    }

    /**
     * Background task that fetches {@code strings[0]} and builds the textual report.
     *
     * <p>NOTE(review): this is a non-static inner class, so a running task keeps a
     * reference to the enclosing activity until it finishes.
     */
    private class ParseURL extends AsyncTask<String, Void, String> {

        /**
         * Downloads and parses the page off the UI thread.
         *
         * @param strings the URL to fetch is expected at index 0
         * @return the report text; if fetching or parsing fails, whatever was collected
         *         so far followed by an error line
         */
        @Override
        protected String doInBackground(String... strings) {
            // Local to this call, so the unsynchronized StringBuilder is sufficient.
            StringBuilder buffer = new StringBuilder();
            try {
                Log.d(TAG, "Connecting to [" + strings[0] + "]");
                Document doc = Jsoup.connect(strings[0]).get();
                Log.d(TAG, "Connected to [" + strings[0] + "]");

                // Get document (HTML page) title
                String title = doc.title();
                Log.d(TAG, "Title [" + title + "]");
                buffer.append("Title: ").append(title).append("\r\n");

                // Get meta info
                Elements metaElems = doc.select("meta");
                buffer.append("META DATA\r\n");
                for (Element metaElem : metaElems) {
                    String name = metaElem.attr("name");
                    String content = metaElem.attr("content");
                    buffer.append("name [").append(name)
                            .append("] - content [").append(content).append("] \r\n");
                }

                Elements topicList = doc.select("h2.topic");
                buffer.append("Topic list\r\n");
                for (Element topic : topicList) {
                    buffer.append("Data [").append(topic.text()).append("] \r\n");
                }

                // Raw content of every <script> element; this is the text a regex
                // would later be applied to in order to extract "resultado1".
                Elements scriptElements = doc.getElementsByTag("script");
                buffer.append("Variavel resultado1\r\n");
                for (Element element : scriptElements) {
                    for (DataNode node : element.dataNodes()) {
                        String scriptdata = node.getWholeData();
                        Log.d(TAG, scriptdata);
                        buffer.append("StriptData [").append(scriptdata).append("] \r\n");
                    }
                    Log.d(TAG, "-------------------");
                }
            } catch (Exception e) {
                // Do not swallow the failure: record it in logcat and tell the user,
                // while still returning whatever was gathered before the error.
                Log.e(TAG, "Failed to fetch or parse the page", e);
                buffer.append("Error: ").append(e).append("\r\n");
            }

            return buffer.toString();
        }

        /** Shows the finished report in the response field (called on the UI thread). */
        @Override
        protected void onPostExecute(String s) {
            super.onPostExecute(s);
            respText.setText(s);
        }
    }
}

Home HTML example page

<html>

<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <script type="text/javascript">
        function var1() {
            ...etc...
        }
    </script>
    <title>Link das Pessoas</title>
</head>

<body>
    <div>Conteudo</div>
    <script>
        function(...)
        etc valorM = (valores de xyz);
        etc valorE = (valores de xy);
        pegavalor(function() {
            ...funcoes_diversars(Conteudo dinamico e estatico...http://arquivosdofulano.com/pessoas 
                ...Conteudo dinamico e estatico)
        })
    </script>
    <div>Conteudo #2</div>
    <script type="text/javascript">
        var google...
    </script>
</body>

</html>


Regex used to get the value in PHP:

/(([http]+[https]:\/\/)(.*?).(com\/pessoas))/
  

Note: I took this regex from PHP; I don't know whether I need to change anything in it for Java.


If possible, I would like code that lets me add further variables like resultado1, each capturing another value into a String with its own regex.


Source AndroidJsoup
Source Source Code

    
asked by anonymous 17.06.2015 / 02:13

1 answer

1

Considering that the content to be extracted is http://arquivosdofulano.com/pessoas we can create the regular expression as follows:

  • Start with http or https - (http|https) - The | character is the OR operator;
  • Is followed by :// - :\/\/ - the \ escapes characters that have a special meaning in regular expressions;
  • Any text - .* - . means "any character" and * is the quantifier for zero or more;
  • .com/pessoas - \.com\/pessoas - the \ escapes characters that have a special meaning in regular expressions;

Putting all of this together, the regular expression is as follows:

(http|https):\/\/.*\.com\/pessoas

To capture this content we use grouping, which is denoted by (). Applied to your code, the result is as follows:

...
// In a Java string literal the regex backslash must itself be escaped ("\\."), and '/'
// needs no escaping at all: "\/" and "\." are illegal escape sequences and do not compile.
Pattern regex = Pattern.compile("((http|https)://.*\\.com/pessoas)");
Matcher matcher = regex.matcher(scriptdata);

// find() searches for the pattern anywhere in the script text; group(1) is the whole URL.
if (matcher.find()) {
  resultado1 = matcher.group(1);
}
...
    
03.02.2017 / 13:22