How to Get All the Links on a Website

Sat, Nov 9, 2019

Read in 2 minutes

Using the Jsoup library, you can get all the links on a website.

Program for JDK 7 and lower versions:

package jsoup;
        import java.io.IOException;
        import java.util.ArrayList;
        import java.util.HashSet;
        import java.util.Set;
        import org.jsoup.Jsoup;
        import org.jsoup.nodes.Document;
        import org.jsoup.nodes.Element;
        import org.jsoup.select.Elements;

        /**
         * Prints the links found on https://theprogrammerguide.com/ by following
         * same-site links recursively, starting from the home page.
         *
         * <p>Written without lambdas or streams so it compiles on older JDKs.
         */
        public class ReadLinksUsingLowerVersion {
            /** Only links starting with this prefix are printed and followed. */
            private static final String SITE_PREFIX = "https://theprogrammerguide.com/";

            /**
             * Every link value encountered so far, whether or not it was followed.
             * Used to make sure each link is processed at most once.
             */
            public static Set<String> urlSet = new HashSet<String>();

            /**
             * Fetches {@code url}, then prints and recursively visits each link on
             * that page that has not been seen before and passes {@link #isCrawlable}.
             *
             * <p>A page that cannot be fetched is reported on standard error and
             * skipped; the crawl continues with the remaining links.
             *
             * @param url address of the page to fetch
             */
            private void getLinks(String url) {
                Document doc;
                try {
                    doc = Jsoup.connect(url).userAgent("Mozilla").get();
                } catch (IOException ex) {
                    // Do not swallow the failure: say which page was skipped and why.
                    System.err.println("Skipping " + url + " : " + ex);
                    return;
                }

                Elements links = doc.select("a[href]");
                for (Element link : links) {
                    String linkUrl = link.attr("abs:href");
                    // Record the link before filtering so it is never examined twice.
                    boolean firstSighting = urlSet.add(linkUrl);
                    if (firstSighting && isCrawlable(linkUrl)) {
                        System.out.println("urls :: " + linkUrl);
                        // NOTE: recursion depth grows with the length of the link chain.
                        getLinks(linkUrl);
                    }
                }
            }

            /**
             * Tells whether a link should be printed and followed.
             *
             * @param linkUrl link value to test
             * @return {@code true} when the link starts with the site prefix, contains
             *         no {@code #} and does not end with {@code .pdf}
             */
            private static boolean isCrawlable(String linkUrl) {
                return linkUrl.startsWith(SITE_PREFIX)
                        && !linkUrl.contains("#")
                        && !linkUrl.endsWith(".pdf");
            }

            /**
             * Starts the crawl at the site's home page.
             *
             * @param args unused
             */
            public static void main(String[] args) {
                ReadLinksUsingLowerVersion obj = new ReadLinksUsingLowerVersion();
                obj.getLinks(SITE_PREFIX);
            }
        }

Program for JDK 8 and higher versions:

package jsoup;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Prints the links found on https://theprogrammerguide.com/ by following
 * same-site links recursively, starting from the home page.
 *
 * <p>Uses the Java 8 stream API to walk the links of each page.
 */
public class ReadLinksWithHigherVersion {
    /** Only links starting with this prefix are printed and followed. */
    private static final String SITE_PREFIX = "https://theprogrammerguide.com/";

    /**
     * Every link value encountered so far, whether or not it was followed.
     * Used to make sure each link is processed at most once.
     */
    public static Set<String> urlSet = new HashSet<String>();

    /**
     * Starts the crawl at the site's home page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ReadLinksWithHigherVersion obj = new ReadLinksWithHigherVersion();
        obj.getLinks(SITE_PREFIX);
    }

    /**
     * Fetches {@code url}, then prints and recursively visits each link on
     * that page that has not been seen before and passes {@link #isCrawlable}.
     *
     * <p>A page that cannot be fetched is reported on standard error and
     * skipped; the crawl continues with the remaining links.
     *
     * @param url address of the page to fetch
     */
    private void getLinks(String url) {
        final Document doc;
        try {
            doc = Jsoup.connect(url).userAgent("Mozilla").get();
        } catch (IOException ex) {
            // Do not swallow the failure: say which page was skipped and why.
            System.err.println("Skipping " + url + " : " + ex);
            return;
        }

        doc.select("a[href]").stream()
                .map(link -> link.attr("abs:href"))
                .forEachOrdered(linkUrl -> {
                    // Record the link before filtering so it is never examined twice.
                    boolean firstSighting = urlSet.add(linkUrl);
                    if (firstSighting && isCrawlable(linkUrl)) {
                        System.out.println(linkUrl);
                        // NOTE: recursion depth grows with the length of the link chain.
                        getLinks(linkUrl);
                    }
                });
    }

    /**
     * Tells whether a link should be printed and followed.
     *
     * @param linkUrl link value to test
     * @return {@code true} when the link starts with the site prefix, contains
     *         no {@code #} and does not end with {@code .pdf}
     */
    private static boolean isCrawlable(String linkUrl) {
        return linkUrl.startsWith(SITE_PREFIX)
                && !linkUrl.contains("#")
                && !linkUrl.endsWith(".pdf");
    }
}