/**This class is the scraper module for downloading images and captions
 * from the Journal of Cell Biology.
 * 
 * Created: December 4, 2009
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 * 
 */

package org.xenbase.scraper;

import java.util.ArrayList;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.xenbase.scraper.data.ScrapedData;
import org.xenbase.scraper.data.ScrapedImage;
import org.xenbase.utilities.StringUtil;

public class Scraper_JCellBio extends BasicScraper {
	
	private static String IMAGEIDENT = "<A HREF=\"/content";
	private static String FIGURE_CAPTION_START = "<BR CLEAR=left>";
	private static String FIGURE_CAPTION_END = "<P>";

    public String getRedirURL(String url) throws Exception, Error {
        String retVal = null;
        try {
        	url = StringUtil.convertUrl(url);			
            HttpClient httpclient = new HttpClient();
            httpclient.getHttpConnectionManager().getParams()
                    .setConnectionTimeout(30000);
            //This needs to be set because one of the cookies contains a
            //domain with a period which messes the code up otherwise
            httpclient.getParams().setCookiePolicy(
                    CookiePolicy.BROWSER_COMPATIBILITY);
            httpclient.getParams().setParameter(
                    SINGLE_COOKIE_HEADER, new Boolean(true));
            GetMethod httpget = new GetMethod(url);
            @SuppressWarnings("unused")
			int result = httpclient.executeMethod(httpget);
            retVal = httpget.getURI().toString();
            httpget.releaseConnection();
        } catch (Exception e) {
            System.out.println("Exception in Scraper_PNAS.getRedirURL: " +e.getMessage());
            throw e;
        } catch (Error e) {
            System.out.println("Error in Scraper_PNAS.getRedirURL: " +e.getMessage());
            throw e;
        }
        return retVal;
    }

	public ScrapedData scrape(String url) throws Exception, Error{

		String baseUrl;
		String caption;
 
		int imageNumber = 1; //Used in constructing URLs for images
		ScrapedData retVal = new ScrapedData(); //return object
		boolean moreImages = true; //loop controller
		int searchStart = 0;
		int searchEnd = 0;
		byte[] b = null; //For images you get
		ArrayList<ScrapedImage> images = new ArrayList<ScrapedImage>(); //Holds all the images
		
		try{
			baseUrl = url.substring(0, (url.indexOf(".org") + 5));
			
			while (moreImages) {
				String fullUrl = url + "/FIG" + imageNumber;
				b = getData(fullUrl);
				String bodyText = new String(b, UTF8); //Got body of main page, now find		
				searchStart = bodyText.indexOf(FIGURE_CAPTION_START);
				if (searchStart < 0)
				{
					moreImages = false;
					break;
				}
				searchEnd = bodyText.indexOf(FIGURE_CAPTION_END, searchStart);
				caption = bodyText.substring(searchStart, searchEnd);
				caption = StringUtil.removeDuplicateWhitespace(caption);
				caption = parseGreekCharacterImages(caption);
				caption = StringUtil.stripHTMLTags(caption);
				caption = StringUtil.unescapeHTML(caption);
				String imgUrl = bodyText.substring((bodyText.indexOf(IMAGEIDENT) + 10), (bodyText.indexOf(".jpeg", bodyText.indexOf(IMAGEIDENT)) + 5));
				imgUrl = baseUrl+imgUrl;
				b = getData(imgUrl);
				ScrapedImage t = new ScrapedImage();
				t.setByteImg(b);
				t.setCaption(caption);
				t.setRefName(FIGURE_TEXT + Integer.toString(imageNumber));
				images.add(t);
				imageNumber++;
			}
			ScrapedImage[] si = new ScrapedImage[images.size()];
			images.toArray(si);
			retVal.setScrapedData(si);
			retVal.setNumberScraped(images.size());
		}catch(Exception e){
		    System.out.println("Exception in Scraper_PNAS.scrape: " +e.getMessage());
		    throw e;
		}catch (Error e){
		    System.out.println("Error in Scraper_PNAS.scrape: " +e.getMessage());
		    throw e;
		}
		return retVal;
	}
	//Because the Journal of Cell Biology doesn't use UTF characters for things like
	//Greek letters, instead they use tiny gifs of the character (same as Dev Dyn.)
	// The best I can do to preserve the content is to replace the image with the 
	//image file name (minus the extension).  The filenames are generally pretty descriptive, 
	//so you'll end up with the text "alpha-helix" instead of using the proper character.
	private String parseGreekCharacterImages(String text) {
		String retVal = "";
		String searchString = "/math/";
		int start = 0;
		int end = 0;
		int startWord = 0;
		int endWord = 0;
		if (text.indexOf(searchString) > 0) {
			while (text.indexOf(searchString) > 0) {
				start = text.indexOf(searchString);
				if (start > 0)
				{
					end = text.indexOf(">", start);					
					start = text.lastIndexOf("<", end);
					end++;
				}
				if (end > start) {
					//Parse the file name for the text we'll replace it with
					String temp = text.substring(start, end);
					endWord = temp.indexOf(".gif");
					startWord = temp.lastIndexOf("/", endWord);
					startWord++;
					if (endWord > startWord) {
						String word = temp.substring(startWord, endWord);
						if (word.length() > 0) {
							String firstHalf = text.substring(0, start);
							String secondHalf = text.substring(end, text
									.length());
							text = firstHalf + " " + word + " " + secondHalf;
							start = 0;
							end = 0;
							startWord = 0;
							endWord = 0;
						}
					}
				}
			}
		} 
		retVal = text;
		return retVal;
	}
}