package hu.procyon.atomizer;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

/**
 * Collects the links found on an HTML page and probes URLs with HTTP HEAD
 * requests.
 */
public class WebScraper {

	/**
	 * Downloads the page at {@code url}, parses it with Tidy and resolves the
	 * {@code href} attribute of every {@code <a>} element that has one against
	 * {@code url}.
	 * <p>
	 * Links whose {@code href} cannot be turned into a URL are reported on
	 * {@code System.err} and skipped.
	 *
	 * @param url the page to scrape; also the base for resolving relative links
	 * @return the links found on the page, in document order
	 * @throws IOException if the page cannot be opened or its stream cannot be closed
	 */
	public List<Href> getUrlList(URL url) throws IOException {
		System.out.println("Scraping " + url.toString());
		List<Href> hrefList = new ArrayList<Href>();
		Tidy tidy = new Tidy();
		//tidy.setQuiet(true);

		// Close the stream as soon as the DOM is built so the connection is not leaked.
		Document xmlDoc;
		InputStream streamIn = url.openStream();
		try {
			xmlDoc = tidy.parseDOM(streamIn, null);
		} finally {
			streamIn.close();
		}

		XPath xPath = XPathFactory.newInstance().newXPath();
		try {
			XPathExpression expression = xPath.compile("//a[@href]");
			NodeList nodes = (NodeList) expression.evaluate(xmlDoc, XPathConstants.NODESET);
			for (int i = 0; i < nodes.getLength(); i++) {
				Element aElem = (Element) nodes.item(i);
				String anchor = aElem.getTextContent();
				// The link target lives in the href attribute; getNodeValue() on an
				// element node is null, which made every link fail URL construction.
				String hrefUrl = aElem.getAttribute("href");
				try {
					URL newUrl = new URL(url, hrefUrl);
					// NOTE(review): hrefList holds Href but a java.net.URL is added here,
					// which does not type-check (URL is final and unrelated to Href).
					// Href is declared outside this file, so the wrapping call is left to
					// someone who can see it; `anchor` above is presumably meant to be
					// passed along with the URL - confirm.
					hrefList.add(newUrl);
				} catch (MalformedURLException e) {
					e.printStackTrace(System.err);
				}
			}
		} catch (XPathExpressionException e1) {
			e1.printStackTrace(System.err);
		}
		return hrefList;
	}

	/**
	 * Sends an HTTP HEAD request to {@code url} without following redirects and
	 * returns the header fields reported by the connection.
	 * <p>
	 * A connect timeout is reported on {@code System.err} and otherwise treated
	 * as best-effort: the method still returns whatever header fields the
	 * connection reports.
	 *
	 * @param url the URL to probe; expected to yield an {@link HttpURLConnection}
	 * @return the header fields as returned by {@link HttpURLConnection#getHeaderFields()}
	 * @throws IOException if the connection cannot be opened, or connecting fails
	 *         for a reason other than a timeout
	 */
	public Map<String, List<String>> doHeadRequest(URL url) throws IOException {
		System.out.println("Sending HEAD to " + url.toString());
		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		// Per-connection switch: the static setFollowRedirects(false) would change
		// redirect handling for every HttpURLConnection in the JVM.
		conn.setInstanceFollowRedirects(false);
		conn.setRequestMethod("HEAD");
		try {
			conn.connect();
		} catch (SocketTimeoutException ex) {
			// Best-effort probe: report the timeout instead of hiding it, then fall through.
			ex.printStackTrace(System.err);
		}
		return conn.getHeaderFields();
	}

	/**
	 * Returns the subset of {@code inHrefs} accepted by {@code filter}.
	 *
	 * @param inHrefs the candidate links; not modified
	 * @param filter decides for each link, given this scraper, whether to keep it
	 * @return a new list with the accepted links, in their original order
	 */
	public List<Href> probeHrefs(List<Href> inHrefs, HrefFilter filter) {
		System.out.println("Probing urls");
		List<Href> output = new ArrayList<Href>();
		for (Href href : inHrefs) {
			if (filter.probeHref(href, this)) {
				output.add(href);
			}
		}
		return output;
	}
}
