Sat, Nov 9, 2019
Read in 2 minutes
Using the Jsoup library, you can get all the links on a website.
Program for JDK 7 and lower versions:
package jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls a website with Jsoup and prints every link found under the site prefix.
 *
 * <p>This variant deliberately avoids lambdas and streams so that it compiles on
 * older JDKs. Starting from the URL given in {@link #main(String[])}, each page is
 * downloaded, its {@code a[href]} anchors are resolved to absolute URLs, and every
 * not-yet-seen URL under {@link #SITE_PREFIX} is printed and crawled in turn.
 */
public class ReadLinksUsingLowerVersion {

    /**
     * Every absolute link URL encountered so far, whether or not it was followed.
     * Used to make sure each URL is printed and crawled at most once.
     */
    public static Set<String> urlSet = new HashSet<String>();

    /** Only URLs beginning with this prefix are printed and followed. */
    private static final String SITE_PREFIX = "https://theprogrammerguide.com/";

    /**
     * Downloads {@code url}, records all of its links in {@link #urlSet}, and
     * recursively crawls each newly discovered link accepted by
     * {@link #shouldFollow(String)}.
     *
     * <p>A page that cannot be fetched is reported on standard error and skipped;
     * the crawl of the remaining pages continues.
     *
     * @param url absolute URL of the page to download and scan
     */
    private void getLinks(String url) {
        Document doc;
        try {
            doc = Jsoup.connect(url).userAgent("Mozilla").get();
        } catch (IOException ex) {
            // Previously swallowed silently, which hid every failed page.
            System.err.println("Could not read " + url + " : " + ex.getMessage());
            return;
        }

        // Elements is itself a list of Element, so no copy or cast is needed.
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            // "abs:href" asks Jsoup for the href resolved to an absolute URL.
            String individualUrl = link.attr("abs:href");
            // add() returns false for a URL seen before, which stops re-crawling.
            boolean isUrlAdded = urlSet.add(individualUrl);
            if (isUrlAdded && shouldFollow(individualUrl)) {
                System.out.println("urls :: " + individualUrl);
                // NOTE(review): recursion depth grows with the length of the link
                // chain; a very deep site could exhaust the stack.
                getLinks(individualUrl);
            }
        }
    }

    /**
     * Decides whether a discovered URL should be printed and crawled.
     *
     * @param candidate absolute URL taken from an anchor
     * @return {@code true} if the URL starts with {@link #SITE_PREFIX}, contains no
     *         {@code '#'} and does not end with {@code ".pdf"}
     */
    private static boolean shouldFollow(String candidate) {
        return candidate.startsWith(SITE_PREFIX)
                && !candidate.contains("#")
                && !candidate.endsWith(".pdf");
    }

    /**
     * Starts the crawl at the site's home page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ReadLinksUsingLowerVersion obj = new ReadLinksUsingLowerVersion();
        obj.getLinks("https://theprogrammerguide.com/");
    }
}
Program for JDK 8 and higher versions:
package jsoup;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Crawls a website with Jsoup and prints every link found under the site prefix,
 * using the Java 8 stream API to walk the anchors of each page.
 *
 * <p>Starting from the URL given in {@link #main(String[])}, each page is
 * downloaded, its {@code a[href]} anchors are resolved to absolute URLs, and every
 * not-yet-seen URL under {@link #SITE_PREFIX} is printed and crawled in turn.
 */
public class ReadLinksWithHigherVersion {

    /**
     * Every absolute link URL encountered so far, whether or not it was followed.
     * Used to make sure each URL is printed and crawled at most once.
     */
    public static Set<String> urlSet = new HashSet<String>();

    /** Only URLs beginning with this prefix are printed and followed. */
    private static final String SITE_PREFIX = "https://theprogrammerguide.com/";

    /**
     * Starts the crawl at the site's home page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ReadLinksWithHigherVersion obj = new ReadLinksWithHigherVersion();
        obj.getLinks("https://theprogrammerguide.com/");
    }

    /**
     * Downloads {@code url} and hands each of its links, in document order, to
     * {@link #visit(String)}.
     *
     * <p>A page that cannot be fetched is reported on standard error and skipped;
     * the crawl of the remaining pages continues.
     *
     * @param url absolute URL of the page to download and scan
     */
    private void getLinks(String url) {
        Document doc;
        try {
            doc = Jsoup.connect(url).userAgent("Mozilla").get();
        } catch (IOException ex) {
            // Previously swallowed silently, which hid every failed page.
            System.err.println("Could not read " + url + " : " + ex.getMessage());
            return;
        }

        Elements links = doc.select("a[href]");
        links.stream()
                // "abs:href" asks Jsoup for the href resolved to an absolute URL.
                .map(link -> link.attr("abs:href"))
                .forEachOrdered(this::visit);
    }

    /**
     * Records a discovered URL and, if it is new and accepted by
     * {@link #shouldFollow(String)}, prints it and crawls it.
     *
     * @param individualUrl absolute URL taken from an anchor
     */
    private void visit(String individualUrl) {
        // add() returns false for a URL seen before, which stops re-crawling.
        boolean added = urlSet.add(individualUrl);
        if (added && shouldFollow(individualUrl)) {
            System.out.println(individualUrl);
            // NOTE(review): recursion depth grows with the length of the link
            // chain; a very deep site could exhaust the stack.
            getLinks(individualUrl);
        }
    }

    /**
     * Decides whether a discovered URL should be printed and crawled.
     *
     * @param candidate absolute URL taken from an anchor
     * @return {@code true} if the URL starts with {@link #SITE_PREFIX}, contains no
     *         {@code '#'} and does not end with {@code ".pdf"}
     */
    private static boolean shouldFollow(String candidate) {
        return candidate.startsWith(SITE_PREFIX)
                && !candidate.contains("#")
                && !candidate.endsWith(".pdf");
    }
}