/**This class is the scraper module for downloading images and captions
 * from the journals Current Biology, Developmental Cell, and Cell.
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 * 
 */

package org.xenbase.scraper;

import java.util.ArrayList;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.xenbase.scraper.data.ScrapedData;
import org.xenbase.scraper.data.ScrapedImage;
import org.xenbase.utilities.StringUtil;

/**
 * Scraper for figure images and captions of articles from the journals
 * Current Biology, Developmental Cell, and Cell.
 *
 * <p>Relies on members inherited from {@link BasicScraper} ({@code getData},
 * {@code QUOTE}, {@code UTF8}, {@code FIGURE_TEXT}, {@code SINGLE_COOKIE_HEADER});
 * their exact definitions are not visible in this file.
 */
public class Scraper_CurrBio_DevCell_Cell extends BasicScraper {
    /** Text that identifies the target link inside the redirect page body. */
    private static final String SCIDIR = "sciencedirect.com/science";

    /** Text searched for on a figure page to locate the image tag. */
    private static final String BIGFIGURE = "img src";

    /** Prefix of the per-figure link marker; the 1-based figure number is appended. */
    private static final String MARKER = "image=fig";

    /** Text in the article URL at which the base URL is cut off. */
    private static final String SCIENCE = "science?";

    /**
     * Resolves the article URL that the given URL redirects to.
     *
     * <p>The URL is first requested with HttpClient and the URI of the executed
     * method is remembered as a fallback. The page body is then fetched with
     * {@code getData} and searched for an embedded link containing
     * {@link #SCIDIR} that is terminated by {@code ');}. When such a link is
     * found it is returned with {@code &amp;} unescaped to {@code &};
     * otherwise the fallback URI is returned.
     *
     * <p>This method is best-effort: any exception or error is printed to
     * standard output and swallowed, and whatever value was resolved up to
     * that point (possibly {@code null}) is returned.
     *
     * @param url the URL to resolve
     * @return the embedded link, the followed URI, or {@code null} if the
     *         request itself failed
     */
    public String getRedirURL(String url) throws Exception, Error {
        String retVal = null;

        try {
            url = StringUtil.convertUrl(url);
            HttpClient httpclient = new HttpClient();
            httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);
            httpclient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
            httpclient.getParams().setParameter(SINGLE_COOKIE_HEADER, Boolean.TRUE);

            GetMethod httpget = new GetMethod(url);
            try {
                httpclient.executeMethod(httpget);
                retVal = httpget.getURI().toString();
            } finally {
                // Always hand the connection back, even when the request fails.
                httpget.releaseConnection();
            }

            // Decode with the same charset scrape() uses rather than the platform default.
            String bodyText = new String(getData(url), UTF8);

            int start = bodyText.indexOf(SCIDIR);
            if (start < 0) {
                return retVal; // no embedded link: keep the followed URI
            }
            int endUrl = bodyText.indexOf("');", start);
            if (endUrl < 0) {
                return retVal;
            }

            // The link starts at the last "http" that precedes its terminator.
            String head = bodyText.substring(0, endUrl);
            int startUrl = head.lastIndexOf("http");
            if (startUrl < 0) {
                return retVal;
            }
            retVal = head.substring(startUrl).replace("&amp;", "&");
        } catch (Exception e) {
            System.out.println("Exception in Scraper_CurrBio_DevCell_Cell.getRedirURL: " + e.getMessage());
        } catch (Error e) {
            System.out.println("Error in Scraper_CurrBio_DevCell_Cell.getRedirURL: " + e.getMessage());
        }
        return retVal;
    }

    /**
     * Downloads every figure (image bytes plus caption) linked from an article page.
     *
     * <p>Figures are discovered by looking for {@code image=fig1},
     * {@code image=fig2}, ... in the article page, stopping at the first
     * number whose marker is absent. An article without any figure marker
     * yields a result containing zero images.
     *
     * @param url the article URL; it must contain {@code science?}, everything
     *            before which is used as the base for relative links
     * @return the scraped images and their count
     * @throws Exception any failure while fetching or parsing (for example a
     *         {@link StringIndexOutOfBoundsException} when {@code url} does
     *         not contain {@code science?}); it is printed to standard output
     *         and rethrown
     */
    public ScrapedData scrape(String url) throws Exception, Error {
        ScrapedData retVal = new ScrapedData();
        ArrayList<ScrapedImage> images = new ArrayList<ScrapedImage>();

        try {
            String baseUrl = url.substring(0, url.indexOf(SCIENCE));
            String bodyText = new String(getData(url), UTF8);

            int imageNumber = 1; // used to build the per-figure marker
            int markerPos = bodyText.indexOf(MARKER + imageNumber);

            // Check the marker BEFORE processing so that a missing figure is
            // never turned into a bogus page request.
            while (markerPos >= 0) {
                // The marker sits inside a quoted link: cut at the closing
                // quote, then back up to the opening one.
                int linkEnd = bodyText.indexOf(QUOTE, markerPos);
                String parsedText = bodyText.substring(0, linkEnd);
                // + 2 skips the opening quote and the character after it
                // (presumably a leading '/' -- TODO confirm against live markup).
                parsedText = parsedText.substring(parsedText.lastIndexOf(QUOTE) + 2);
                String imagePage = new String(getData(baseUrl + parsedText), UTF8);

                // Skip past BIGFIGURE and the two characters after it
                // (presumably ="), then read up to the closing quote.
                int srcStart = imagePage.indexOf(BIGFIGURE) + BIGFIGURE.length() + 2;
                int srcEnd = imagePage.indexOf(QUOTE, srcStart);
                String imageUrl = imagePage.substring(srcStart, srcEnd);
                if (imageUrl.indexOf("sciencedirect") == -1) {
                    imageUrl = baseUrl + imageUrl;
                }
                if (imageUrl.indexOf("<head>") > 0) {
                    // The extraction above swallowed page markup instead of a
                    // URL; fall back to the quoted value containing "cache".
                    int cacheEnd = imagePage.indexOf(QUOTE, imagePage.indexOf("cache"));
                    String upToCache = imagePage.substring(0, cacheEnd);
                    imageUrl = upToCache.substring(upToCache.lastIndexOf(QUOTE) + 1);
                }

                ScrapedImage image = new ScrapedImage();
                image.setByteImg(getData(imageUrl));

                // Caption runs from FIGURE_TEXT up to the next <br>.
                int captionStart = imagePage.indexOf(FIGURE_TEXT);
                int captionEnd = imagePage.indexOf("<br>", captionStart);
                String caption = imagePage.substring(captionStart, captionEnd);
                caption = StringUtil.stripHTMLTags(caption);
                caption = StringUtil.unescapeHTML(caption);
                image.setCaption(caption);
                image.setRefName(FIGURE_TEXT + Integer.toString(imageNumber));
                images.add(image);

                imageNumber++;
                markerPos = bodyText.indexOf(MARKER + imageNumber);
            }

            retVal.setScrapedData(images.toArray(new ScrapedImage[images.size()]));
            retVal.setNumberScraped(images.size());
        } catch (Exception e) {
            System.out.println("Exception in Scraper_CurrBio_DevCell_Cell.scrape: " + e.getMessage());
            throw e;
        } catch (Error e) {
            System.out.println("Error in Scraper_CurrBio_DevCell_Cell.scrape: " + e.getMessage());
            throw e;
        }
        return retVal;
    }
}