/**This class is the scraper module for downloading images and captions
 * from the journal Development.
 * 
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 */
package org.xenbase.scraper;

import java.util.ArrayList;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.xenbase.scraper.data.ScrapedData;
import org.xenbase.scraper.data.ScrapedImage;
import org.xenbase.utilities.StringUtil;

/**
 * Scraper for the journal Development: walks an article's figure pages,
 * downloads each large figure image and extracts the matching caption.
 */
public class Scraper_Development extends BasicScraper {
	/** Text searched for in a figure page to locate the large-image link. */
	private static final String MARKER = "images/large";

	/** Figure page name prefix tried first (FIG1, FIG2, ...). */
	private static final String FIG = "FIG";

	/** Figure page name prefix tried when the FIG page is not found (F1, F2, ...). */
	private static final String FIG_SHORT = "F";

	/** Text that marks the start of a caption inside a figure page. */
	private static final String FIGCAP = "Fig. ";

	/** Tag that marks the end of a caption inside a figure page. */
	private static final String CAPTION_END = "<P>";

	/** Path of the full-text view of an article, relative to the site base. */
	private static final String ARTICLE = "cgi/content/full";

	/** Text found in the response when a requested image does not exist. */
	private static final String REQUEST = "page you requested";

	/** Text found in the response when a figure page does not exist. */
	private static final String NOT_FOUND = "Page Not Found";

	/** Caption stored when no well-formed caption can be extracted. */
	private static final String MALFORMED_CAPTION = "Malformed caption detected.";

	/** URL segment that is rewritten to the full-text segment. */
	private static final String REPRINT = "reprint";

	/** URL segment that replaces {@link #REPRINT}. */
	private static final String CONTENT_FULL = "content/full";

	/**
	 * Requests the given URL and returns the URI reported by the request
	 * after execution (presumably the post-redirect location, given the
	 * method name), with any "reprint" segment rewritten to "content/full".
	 *
	 * @param url the URL to request; it is first passed through
	 *            {@code StringUtil.convertUrl}
	 * @return the resulting URL
	 * @throws Exception any exception raised while preparing or executing
	 *                   the request; it is logged to stdout and rethrown
	 * @throws Error     any error raised; it is logged to stdout and rethrown
	 */
	public String getRedirURL(String url) throws Exception, Error{
		String retVal = null;
		GetMethod httpget = null;
		try {
			url = StringUtil.convertUrl(url);
			HttpClient httpclient = new HttpClient();
			httpclient.getHttpConnectionManager().getParams()
					.setConnectionTimeout(30000);
			//This needs to be set because one of the cookies contains a
			//domain with a period which messes the code up otherwise
			httpclient.getParams().setCookiePolicy(
					CookiePolicy.BROWSER_COMPATIBILITY);
			httpclient.getParams().setParameter(
					SINGLE_COOKIE_HEADER, Boolean.TRUE);
			httpget = new GetMethod(url);
			httpclient.executeMethod(httpget);
			//Sometimes the URL comes back and it's messed up, this fixes it
			retVal = toFullTextUrl(httpget.getURI().toString());
		} catch (Exception e) {
			System.out.println("Exception in Scraper_Development.getRedirURL: " + e.getMessage());
			throw e;
		} catch (Error e) {
			System.out.println("Error in Scraper_Development.getRedirURL: " + e.getMessage());
			throw e;
		} finally {
			//Release the connection on every path, including failures
			if (httpget != null) {
				httpget.releaseConnection();
			}
		}
		return retVal;
	}

	/**
	 * Scrapes all figures of the article at the given URL. Figure pages are
	 * requested in order (1, 2, 3, ...) until a page or image is missing.
	 *
	 * @param url article URL; must contain "cgi" and "full" (a "reprint"
	 *            segment is rewritten to "content/full" first)
	 * @return the scraped images with their captions and reference names,
	 *         plus the number of images scraped
	 * @throws Exception any exception raised while scraping, including
	 *                   {@link IllegalArgumentException} when the URL does
	 *                   not have the expected shape; it is logged to stdout
	 *                   and rethrown
	 * @throws Error     any error raised; it is logged to stdout and rethrown
	 */
	public ScrapedData scrape(String url) throws Exception, Error {
		ScrapedData retVal = new ScrapedData(); //return object
		ArrayList<ScrapedImage> images = new ArrayList<ScrapedImage>();
		try {
			url = toFullTextUrl(url);
			int cgiIndex = url.indexOf("cgi");
			int fullIndex = url.indexOf("full");
			if (cgiIndex < 0 || fullIndex < 0) {
				throw new IllegalArgumentException(
						"Not a Development full-text URL: " + url);
			}
			String baseUrl = url.substring(0, cgiIndex);
			//Everything after "full" (4 characters) identifies the article
			String urlNumbers = url.substring(fullIndex + 4);
			String articleUrl = baseUrl + ARTICLE + urlNumbers + "/";

			for (int imageNumber = 1;; imageNumber++) {
				String pageText = fetchFigurePage(articleUrl, imageNumber);
				if (pageText == null) {
					break; //no figure page with this number
				}
				String imagePath = extractImagePath(pageText);
				if (imagePath == null) {
					break; //figure page carries no large-image link
				}
				byte[] imageBytes = getData(baseUrl + imagePath);
				if (imageBytes.length == 0) {
					break;
				}
				//An error page instead of image data also ends the scrape
				String checkContent = new String(imageBytes, UTF8);
				if (checkContent.indexOf(REQUEST) >= 0) {
					break;
				}

				ScrapedImage image = new ScrapedImage();
				image.setByteImg(imageBytes);
				//The caption lives on the figure page that was already downloaded
				image.setCaption(extractCaption(pageText));
				image.setRefName(FIGURE_TEXT + Integer.toString(imageNumber));
				images.add(image);
			}

			ScrapedImage[] si = images.toArray(new ScrapedImage[images.size()]);
			retVal.setScrapedData(si);
			retVal.setNumberScraped(images.size());
		} catch (Exception e) {
			System.out.println("Exception in Scraper_Development.scrape: " + e.getMessage());
			throw e;
		} catch (Error e) {
			System.out.println("Error in Scraper_Development.scrape: " + e.getMessage());
			throw e;
		}
		return retVal;
	}

	/** Replaces the first "reprint" segment of a URL with "content/full". */
	private static String toFullTextUrl(String url) {
		int reprintIndex = url.indexOf(REPRINT);
		if (reprintIndex < 0) {
			return url;
		}
		return url.substring(0, reprintIndex) + CONTENT_FULL
				+ url.substring(reprintIndex + REPRINT.length());
	}

	/**
	 * Downloads the page of figure {@code imageNumber}, trying the "FIG"
	 * name first and the "F" name second; returns null when both are
	 * reported as not found.
	 */
	private String fetchFigurePage(String articleUrl, int imageNumber) throws Exception {
		String number = Integer.toString(imageNumber);
		String pageText = new String(getData(articleUrl + FIG + number), UTF8);
		if (pageText.indexOf(NOT_FOUND) >= 0) {
			pageText = new String(getData(articleUrl + FIG_SHORT + number), UTF8);
			if (pageText.indexOf(NOT_FOUND) >= 0) {
				return null;
			}
		}
		return pageText;
	}

	/**
	 * Extracts the quoted value that contains {@link #MARKER} from a figure
	 * page; returns null when the marker or its surrounding quotes are
	 * missing.
	 */
	private String extractImagePath(String pageText) {
		int markerIndex = pageText.indexOf(MARKER);
		if (markerIndex < 0) {
			return null;
		}
		int closingQuote = pageText.indexOf(QUOTE, markerIndex);
		if (closingQuote < 0) {
			return null;
		}
		String head = pageText.substring(0, closingQuote);
		int openingQuote = head.lastIndexOf(QUOTE);
		if (openingQuote < 0) {
			return null;
		}
		//+2 skips the quote and the character after it; presumably a
		//leading "/" that the base URL already ends with - TODO confirm
		return head.substring(openingQuote + 2);
	}

	/**
	 * Extracts the caption text (from "Fig. " up to the next "&lt;P&gt;")
	 * from a figure page, with HTML tags stripped, entities unescaped and
	 * newlines replaced by spaces; returns the malformed-caption text when
	 * either boundary is missing.
	 */
	private String extractCaption(String pageText) throws Exception {
		int captionStart = pageText.indexOf(FIGCAP);
		if (captionStart < 0) {
			return MALFORMED_CAPTION;
		}
		String caption = pageText.substring(captionStart);
		int captionEnd = caption.indexOf(CAPTION_END);
		if (captionEnd < 0) {
			return MALFORMED_CAPTION;
		}
		caption = caption.substring(0, captionEnd);
		caption = StringUtil.stripHTMLTags(caption);
		caption = StringUtil.unescapeHTML(caption);
		return caption.replaceAll("\n", " ");
	}

}