/**This class is the scraper module for downloading images and captions
 * from the journal Proceedings of the National Academy of Sciences (PNAS).
 * 
 * Created: March 4, 2008
 * Modified: December 2, 2009
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 * 
 */

package org.xenbase.scraper;

import java.util.ArrayList;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.xenbase.scraper.data.ScrapedData;
import org.xenbase.scraper.data.ScrapedImage;
import org.xenbase.utilities.StringUtil;

public class Scraper_PNAS extends BasicScraper {
    private String BASE = ".long";

    private String FIGURE = ".large.jpg";
	
    private String FIGURE_CAPTION_START= "<div class=\"fig-caption\">";
	
    private String FIGURE_CAPTION_END = "</div>";
	

    public String getRedirURL(String url) throws Exception, Error {
        String retVal = null;
        try {
        	url = StringUtil.convertUrl(url);			
            HttpClient httpclient = new HttpClient();
            httpclient.getHttpConnectionManager().getParams()
                    .setConnectionTimeout(30000);
            //This needs to be set because one of the cookies contains a
            //domain with a period which messes the code up otherwise
            httpclient.getParams().setCookiePolicy(
                    CookiePolicy.BROWSER_COMPATIBILITY);
            httpclient.getParams().setParameter(
                    SINGLE_COOKIE_HEADER, new Boolean(true));
            GetMethod httpget = new GetMethod(url);
            @SuppressWarnings("unused")
			int result = httpclient.executeMethod(httpget);
            retVal = httpget.getURI().toString();
            httpget.releaseConnection();
        } catch (Exception e) {
            System.out.println("Exception in Scraper_PNAS.getRedirURL: " +e.getMessage());
            throw e;
        } catch (Error e) {
            System.out.println("Error in Scraper_PNAS.getRedirURL: " +e.getMessage());
            throw e;
        }
        return retVal;
    }

	public ScrapedData scrape(String url) throws Exception, Error{

		String baseUrl;
		String fullUrl;
		String caption;
 
		int imageNumber = 1; //Used in constructing URLs for images
		ScrapedData retVal = new ScrapedData(); //return object
		boolean moreImages = true; //loop controller
		int searchStart = 0;
		int searchEnd = 0;
		byte[] b = null; //For images you get
		ArrayList<ScrapedImage> images = new ArrayList<ScrapedImage>(); //Holds all the images
		
		try{
			searchStart = url.indexOf(BASE);
			baseUrl = url.substring(0, searchStart);
			baseUrl = baseUrl + "/";
			b = getData(url);
			String bodyText = new String(b, UTF8); //Got body of main page, now find
	
			while (moreImages) {
	
				searchStart = bodyText.indexOf((FIGURE_CAPTION_START));
				if (searchStart < 0)
				{
					moreImages = false;
					break;
				}
				searchEnd = bodyText.indexOf(FIGURE_CAPTION_END, searchStart);
				caption = bodyText.substring(searchStart, searchEnd);
				caption = StringUtil.removeDuplicateWhitespace(caption);
				caption = StringUtil.stripHTMLTags(caption);
				bodyText = bodyText.substring(searchEnd, bodyText.length());			
				fullUrl = baseUrl + "F" + imageNumber + FIGURE;			
				b = getData(fullUrl);
				ScrapedImage t = new ScrapedImage();
				t.setByteImg(b);
				t.setCaption(caption);
				t.setRefName(FIGURE_TEXT + Integer.toString(imageNumber));
				images.add(t);
				imageNumber++;
			}
			ScrapedImage[] si = new ScrapedImage[images.size()];
			images.toArray(si);
			retVal.setScrapedData(si);
			retVal.setNumberScraped(images.size());
		}catch(Exception e){
		    System.out.println("Exception in Scraper_PNAS.scrape: " +e.getMessage());
		    throw e;
		}catch (Error e){
		    System.out.println("Error in Scraper_PNAS.scrape: " +e.getMessage());
		    throw e;
		}
		return retVal;
	}
}