|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  org.matalon.pagerankhits.crawler.Crawler
This class is responsible for crawling the web according to the given parameters.
Field Summary | |
private int |
maxRetrievedUrlsNo
|
private java.lang.String |
startingAddress
|
private boolean |
stopCrawling
|
static Crawler |
theOnlyOne
|
private static Graph |
webGraph
|
Constructor Summary | |
private |
Crawler()
Private constructor. |
Method Summary | |
void |
crawl(java.lang.String startingAddress,
int maxRetrievedUrlsNo)
This method crawls over web pages on the Internet and extracts all their links. |
static Crawler |
getInstance()
|
Graph |
getWebGraph()
|
boolean |
isStopCrawling()
|
static java.lang.String |
makeShorter(java.lang.String string,
int maxLength)
Shortens the given string to a maximum of maxLength characters. |
private void |
prepareSummary(UrlsQueue visitedUrlsQueue)
Prepares a summary of crawler's progress; this summary is written to GeneralSettings.CRAWLING_PROGRESS_REPORT_FILE file. |
private static void |
printUrl(java.io.PrintWriter printWriter,
java.lang.String url,
int urlNo,
int maxUrlsNo)
Prints the given url to the given PrintWriter. |
private static void |
printUrlDetails(java.io.PrintWriter printWriter,
int urlNo,
WebPageProperties webPageProperties)
Prints the given URL details to the given PrintWriter. |
void |
run()
|
void |
stopCrawling()
Stops the Crawler. |
private static void |
updateRealTimeProgress(UrlsQueue urls2Print,
int currCrawledUrlNo,
int maxUrlsNo,
boolean lastUrl,
boolean stopCrawling)
Updates crawler's progress in real time to GeneralSettings.CRAWLING_PROGRESS_REPORT_FILE_TAIL file. |
private static boolean |
wasUrlVisited(java.lang.String url,
java.util.Map visitedUrlsMap)
|
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
public static Crawler theOnlyOne
private static Graph webGraph
private boolean stopCrawling
private java.lang.String startingAddress
private int maxRetrievedUrlsNo
Constructor Detail |
private Crawler()
Method Detail |
public static Crawler getInstance()
public void crawl(java.lang.String startingAddress, int maxRetrievedUrlsNo)
startingAddress
- maxRetrievedUrlsNo
-
public void run()
run
in interface java.lang.Runnable
Runnable.run()
private static final boolean wasUrlVisited(java.lang.String url, java.util.Map visitedUrlsMap)
url
- visitedUrlsMap
-
Returns: true if url was visited, false otherwise.
private static final void updateRealTimeProgress(UrlsQueue urls2Print, int currCrawledUrlNo, int maxUrlsNo, boolean lastUrl, boolean stopCrawling)
urls2Print
- currCrawledUrlNo
- maxUrlsNo
- lastUrl
- stopCrawling
-
private static final void printUrl(java.io.PrintWriter printWriter, java.lang.String url, int urlNo, int maxUrlsNo)
Prints the given url to the given PrintWriter.
printWriter
- url
- urlNo
- maxUrlsNo
-
public static final java.lang.String makeShorter(java.lang.String string, int maxLength)
string
- maxLength
-
private final void prepareSummary(UrlsQueue visitedUrlsQueue)
visitedUrlsQueue
-
private static final void printUrlDetails(java.io.PrintWriter printWriter, int urlNo, WebPageProperties webPageProperties)
Prints the given URL details to the given PrintWriter.
printWriter
- urlNo
- webPageProperties
-
public Graph getWebGraph()
public boolean isStopCrawling()
public void stopCrawling()
|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |