// Exercise 4.2.39 (Solution published at http://algs4.cs.princeton.edu/)
package algs42;
import stdlib.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import algs13.Queue;
import algs35.SET;
/* ***********************************************************************
 *  Compilation:  javac XWebCrawler.java
 *  Execution:    java algs42.XWebCrawler url
 *
 *  Crawls the web breadth-first from the given start page, printing the
 *  URL of each page it visits. Gives an idea of how Google's spider
 *  crawls the web. Instead of parsing hyperlinks, we simply look for
 *  patterns of the form: http:// followed by an alternating sequence
 *  of alphanumeric strings and dots, ending with a sequence of
 *  alphanumeric characters.
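 *
 *  Note that this simple pattern misses https:// links and stops at a
 *  hyphen in a host name, since \w matches only letters, digits, and '_'.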
 *
 *  % java WebCrawler http://www.slashdot.org
 *  http://www.slashdot.org
 *  http://www.osdn.com
 *  http://sf.net
 *  http://thinkgeek.com
 *  http://freshmeat.net
 *  http://newsletters.osdn.com
 *  http://slashdot.org
 *  http://osdn.com
 *  http://ads.osdn.com
 *  http://sourceforge.net
 *  http://www.msnbc.msn.com
 *  http://www.rhythmbox.org
 *  http://www.apple.com
 *  ...
 *
 *  % java WebCrawler http://www.cs.princeton.edu
 *  http://www.cs.princeton.edu
 *  http://www.w3.org
 *  http://maps.yahoo.com
 *  http://www.princeton.edu
 *  http://www.Princeton.EDU
 *  http://ncstrl.cs.Princeton.EDU
 *  http://www.genomics.princeton.edu
 *  http://www.math.princeton.edu
 *  http://libweb.Princeton.EDU
 *  http://libweb2.princeton.edu
 *  http://www.acm.org
 *  ...
 *
 *
 *  Instead of setting the system properties in the code, you could set
 *  them on the command line:
 *  % java -Dsun.net.client.defaultConnectTimeout=250 algs42.XWebCrawler http://www.cs.princeton.edu
 *
 *************************************************************************/

public class XWebCrawler {

  public static void main(String[] args) {
    // default start page, used only if no URL is given on the command line
    //if (args.length == 0) args = new String[] { "http://www.slashdot.org" };
    //if (args.length == 0) args = new String[] { "http://www.cdm.depaul.edu" };
    if (args.length == 0) args = new String[] { "http://fpl.cs.depaul.edu/jriely" };

    // time out connections after 500 milliseconds and reads after 1000 milliseconds
    System.setProperty("sun.net.client.defaultConnectTimeout", "500");
    System.setProperty("sun.net.client.defaultReadTimeout",    "1000");

    // initial web page
    String s = args[0];

    // queue of web pages yet to be examined
    Queue<String> queue = new Queue<>();
    queue.enqueue(s);

    // set of web pages already discovered (and enqueued)
    SET<String> marked = new SET<>();
    marked.add(s);

    // breadth-first search crawl of the web
    while (!queue.isEmpty()) {
      String v = queue.dequeue();
      StdOut.println(v);

      In in = new In(v);

      // only needed in case website does not respond
      if (!in.exists()) continue;

      String input = in.readAll();
      if (input == null) continue;


      /* ***********************************************************
       *  Find links of the form: http://xxx.yyy.zzz
       *  \\w+ matches one or more alphanumeric characters
       *  \\. matches a literal dot
       *  The regexp string and Pattern.compile call never change,
       *  so they could be hoisted out of the while loop.
       *************************************************************/
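      // e.g., in the text '<a href="http://www.cs.princeton.edu/courses">',
      // matcher.group() below returns "http://www.cs.princeton.edu" (the path is not matched)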
      String regexp = "http://(\\w+\\.)*(\\w+)";
      Pattern pattern = Pattern.compile(regexp);
      Matcher matcher = pattern.matcher(input);

      // find all matches; enqueue and mark each newly discovered URL
      while (matcher.find()) {
        String w = matcher.group();
        if (!marked.contains(w)) {
          queue.enqueue(w);
          marked.add(w);
        }
      }

    }
  }
}