How to retrieve text from a website with a DOM crawler

Problem description (votes: -4, answers: 1)

I want to retrieve the text and the links from a website. I managed to retrieve the links, but I don't know how to retrieve the text from the HTML. I need some function such as trim to do this. Here is my code:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URL;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.io.IOException;

public class WebSpider {
    private final Set<URL> links;
    private final long startTime;

    private WebSpider(final URL startURL) {
        this.links = new HashSet<>();
        this.startTime = System.currentTimeMillis();
        crawl(initURLS(startURL));
    }

    private void crawl(final Set<URL> urls) {
        if (!urls.isEmpty()) {
            final Set<URL> newURLS = new HashSet<>();
            try {
                this.links.addAll(urls);
                for (final URL url : urls) {
                    System.out.println("time=" + (System.currentTimeMillis() - this.startTime) + " connected to " + url);
                    final Document document = Jsoup.connect(url.toString()).get();
                    final Elements linksOnPage = document.select("a[href]");
                    for (final Element element : linksOnPage) {
                        final String urlText = element.attr("abs:href");
                        final URL discoveredURL = new URL(urlText);
                        // Skip URLs that were already visited so the recursion can terminate.
                        if (!this.links.contains(discoveredURL)) {
                            newURLS.add(discoveredURL);
                        }
                    }
                }
            } catch (final Exception | Error ignored) {
            }
            crawl(newURLS);
        }
    }

    private Set<URL> initURLS(final URL startURL) {
        return Collections.singleton(startURL);
    }

    public static void main(String[] args) throws IOException {
        new WebSpider(new URL("http://www.talkmorocco.net/"));
    }
}
Tags: java, mongodb, web, web-crawler
1 Answer

-1 votes
Jsoup already exposes the visible text of any node through text(). One way to get what you want is to print document.body().text() for the whole page, or element.text() for each link's label, trimming as needed:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URL;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.io.IOException;

public class WebSpider {
    private final Set<URL> links;
    private final long startTime;

    private WebSpider(final URL startURL) {
        this.links = new HashSet<>();
        this.startTime = System.currentTimeMillis();
        crawl(initURLS(startURL));
    }

    private void crawl(final Set<URL> urls) {
        if (!urls.isEmpty()) {
            final Set<URL> newURLS = new HashSet<>();
            try {
                this.links.addAll(urls);
                for (final URL url : urls) {
                    System.out.println("time=" + (System.currentTimeMillis() - this.startTime) + " connected to " + url);
                    final Document document = Jsoup.connect(url.toString()).get();
                    // text() returns the visible text of a node, whitespace-normalised.
                    System.out.println(document.body().text().trim());
                    final Elements linksOnPage = document.select("a[href]");
                    for (final Element element : linksOnPage) {
                        // element.text() is the anchor's own text, i.e. the link label.
                        System.out.println(element.text().trim() + " -> " + element.attr("abs:href"));
                        final URL discoveredURL = new URL(element.attr("abs:href"));
                        if (!this.links.contains(discoveredURL)) {
                            newURLS.add(discoveredURL);
                        }
                    }
                }
            } catch (final Exception | Error ignored) {
            }
            crawl(newURLS);
        }
    }

    private Set<URL> initURLS(final URL startURL) {
        return Collections.singleton(startURL);
    }

    public static void main(String[] args) throws IOException {
        new WebSpider(new URL("http://www.talkmorocco.net/"));
    }
}
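
If you only need the text of a single page rather than a full crawl, a minimal sketch looks like this (it reuses the URL from the question; the class name PageText is just illustrative):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PageText {
    public static void main(String[] args) throws Exception {
        final Document document = Jsoup.connect("http://www.talkmorocco.net/").get();
        // title() is the contents of the <title> tag; body().text() is the
        // whitespace-normalised visible text of the page, so trim() is rarely needed.
        System.out.println(document.title());
        System.out.println(document.body().text().trim());
    }
}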