/**
 * This code is written specifically to scrape images off of the website for the
 * journal Developmental Dynamics. This code should, in theory, work with other
 * journals that are published by Wiley; however, for journals coming from other
 * publishers this code cannot be used. The main entry point is the 'scrape'
 * function, which takes a URL to the article (this should be the article
 * from the framed window) and returns a ScrapedData object which consists of
 * the scraped images (as byte arrays) and their captions (as strings).
 * 
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 */
package org.xenbase.scraper;

import java.util.ArrayList;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.xenbase.scraper.data.ScrapedData;
import org.xenbase.scraper.data.ScrapedImage;
import org.xenbase.utilities.StringUtil;

/**
 * Scraper for article pages of the journal Developmental Dynamics.
 *
 * <p>{@link #getRedirURL(String)} turns an abstract URL into the base URL that
 * {@link #scrape(String)} expects; {@code scrape} then downloads the numbered
 * figure images and pairs each one with the caption found in the article body.
 *
 * <p>{@code getData}, {@code UTF8}, {@code FIGURE_TEXT} and
 * {@code SINGLE_COOKIE_HEADER} are not declared in this class; they are
 * presumably inherited from {@code BasicScraper}.
 */
public class Scraper_DevDyn extends BasicScraper {

	/** URL fragment that precedes the figure number of a normal-size figure. */
	private static final String NORMAL_FIGURE = "nfig";

	/** URL fragment that precedes the figure number of a large figure. */
	private static final String LARGE_FIGURE = "mfig";

	/** Padding for two-digit figure numbers. */
	private static final String ONE_ZERO = "0";

	/** Padding for one-digit figure numbers. */
	private static final String TWO_ZEROS = "00";

	/** This is what we match to determine the end of a caption. */
	private static final String END_OF_CAPTION = "<BR>[<A";

	/** This is what we match to determine the beginning of a caption. */
	private static final String FIGURE = "Figure ";

	/** Marker appended by getRedirURL and stripped again by scrape. */
	private static final String HTML_START = "HTMLSTART";

	/**
	 * Requests the given URL and derives the full-text base URL from the URI
	 * the request ended up at.
	 *
	 * <p>The resulting URI is cut at the first "ABSTRACT" (or, failing that,
	 * "abstract"), every "journal" in it is replaced by "cgi-bin/fulltext",
	 * and "HTMLSTART" is appended unless already present.
	 *
	 * @param url the article URL; it is passed through
	 *            {@code StringUtil.convertUrl} before use
	 * @return the derived URL ending in "HTMLSTART", or {@code null} if the
	 *         request failed or the resulting URI contains no abstract marker
	 * @throws Exception declared for interface compatibility; failures inside
	 *             this method are logged and reported as {@code null}
	 * @throws Error declared for interface compatibility, see above
	 */
	public String getRedirURL(String url) throws Exception, Error {
		String retVal = null;
		GetMethod httpget = null;
		try {
			url = StringUtil.convertUrl(url);
			HttpClient httpclient = new HttpClient();
			httpclient.getHttpConnectionManager().getParams()
					.setConnectionTimeout(30000);
			//This needs to be set because one of the cookies contains a
			//domain with a period which messes the code up otherwise
			httpclient.getParams().setCookiePolicy(
					CookiePolicy.BROWSER_COMPATIBILITY);
			//Wiley's server will only accept a request if all the cookies
			//are in one string.
			httpclient.getParams().setParameter(
					SINGLE_COOKIE_HEADER, Boolean.TRUE);
			httpget = new GetMethod(url);
			httpclient.executeMethod(httpget);
			retVal = httpget.getURI().toString();

			int cut = retVal.indexOf("ABSTRACT");
			if (cut == -1) {
				cut = retVal.indexOf("abstract");
			}
			if (cut == -1) {
				//Previously this case only surfaced as an index-out-of-range
				//exception from substring; the outcome (null) is unchanged.
				System.out.println("Exception in Scraper_DevDyn.getRedirURL: "
						+ "no abstract marker in " + retVal);
				return null;
			}
			retVal = retVal.substring(0, cut);
			retVal = retVal.replace("journal", "cgi-bin/fulltext");
		} catch (Exception e) {
			System.out.println("Exception in Scraper_DevDyn.getRedirURL: " + e.getMessage());
			return null;
		} catch (Error e) {
			System.out.println("Error in Scraper_DevDyn.getRedirURL: " + e.getMessage());
			return null;
		} finally {
			//Always hand the connection back, also when something above threw
			if (httpget != null) {
				httpget.releaseConnection();
			}
		}
		if (retVal.indexOf(HTML_START) == -1) {
			retVal = retVal + HTML_START;
		}
		return retVal;
	}

	/**
	 * Downloads the figures of an article and attaches their captions.
	 *
	 * <p>Figures are requested one by one, starting at number 1. For each
	 * number the large version is tried first and the normal version second;
	 * the first number for which neither yields data ends the search. The
	 * captions are then cut out of "main.html" (text from "Figure N." up to
	 * the end-of-caption marker), stripped of markup and stored on the
	 * matching image together with its reference name. If no caption can be
	 * located for a figure, a message is printed and that image simply keeps
	 * no caption.
	 *
	 * @param url the base URL as produced by {@link #getRedirURL(String)};
	 *            everything from "HTMLSTART" on is ignored, and a URL without
	 *            that marker is used as is
	 * @return the scraped images and their count
	 * @throws Exception if fetching or processing fails; the exception is
	 *             logged and rethrown
	 * @throws Error logged and rethrown likewise
	 */
	public ScrapedData scrape(String url) throws Exception, Error {
		ScrapedData retVal = new ScrapedData();
		ArrayList<ScrapedImage> images = new ArrayList<ScrapedImage>();

		int markerAt = url.indexOf(HTML_START);
		if (markerAt >= 0) {
			url = url.substring(0, markerAt);
		}
		try {
			//Get the images
			for (int imageNumber = 1;; imageNumber++) {
				String suffix = figureNumberSuffix(imageNumber);
				//Try to get the large image first (if it exists)
				byte[] data = getData(url + LARGE_FIGURE + suffix);
				if (data == null || data.length == 0) {
					//The large image doesn't exist, try the normal sized one
					data = getData(url + NORMAL_FIGURE + suffix);
				}
				if (data == null || data.length == 0) {
					//Neither size exists: assume there are no more images
					break;
				}
				ScrapedImage image = new ScrapedImage();
				image.setByteImg(data);
				images.add(image);
			}

			//Get captions
			byte[] body = getData(url + "main.html");
			String bodyText = new String(body, UTF8);
			//There will only be as many captions as there are images
			for (int i = 1; i <= images.size(); i++) {
				ScrapedImage image = images.get(i - 1);
				image.setRefName(FIGURE_TEXT + Integer.toString(i));

				String searchStr = FIGURE + Integer.toString(i) + ".";
				int captionStart = bodyText.indexOf(searchStr);
				int captionEnd = -1;
				if (captionStart >= 0) {
					captionEnd = bodyText.indexOf(END_OF_CAPTION,
							captionStart + 1);
				}
				if (captionStart < 0 || captionEnd < 0) {
					//Keep the image; one missing caption must not discard
					//everything that was downloaded
					System.out.println("Scraper_DevDyn.scrape: no caption found for "
							+ searchStr);
					continue;
				}
				String caption = bodyText.substring(captionStart, captionEnd);
				//Get rid of all the markup
				caption = parseGreekCharacterImages(caption);
				caption = StringUtil.removeDuplicateWhitespace(caption);
				caption = StringUtil.stripHTMLTags(caption);
				caption = StringUtil.unescapeHTML(caption);
				image.setCaption(caption);
			}
			ScrapedImage[] scraped = new ScrapedImage[images.size()];
			images.toArray(scraped);
			retVal.setScrapedData(scraped);
			retVal.setNumberScraped(images.size());
		} catch (Exception e) {
			System.out.println("Exception in Scraper_DevDyn.scrape: " + e.getMessage());
			throw e;
		} catch (Error e) {
			System.out.println("Error in Scraper_DevDyn.scrape: " + e.getMessage());
			throw e;
		}
		return retVal;
	}

	/**
	 * Formats a figure number the way it appears in figure URLs: padded with
	 * zeros to three digits (1 becomes "001", 10 becomes "010"), unpadded from
	 * 100 upwards.
	 */
	private static String figureNumberSuffix(int imageNumber) {
		String digits = Integer.toString(imageNumber);
		if (imageNumber < 10) {
			return TWO_ZEROS + digits;
		}
		if (imageNumber < 100) {
			return ONE_ZERO + digits;
		}
		return digits;
	}

	/**
	 * Replaces every tag that refers to a "giflibrary" image by the image's
	 * file name without the ".gif" extension, surrounded by single spaces.
	 *
	 * <p>Dev Dyn. doesn't use UTF characters for things like Greek letters;
	 * instead they use tiny gifs of the character. The best that can be done
	 * to preserve the content is to substitute the file name, which is
	 * generally descriptive, so the text ends up reading e.g. "alpha-helix".
	 *
	 * <p>The text is scanned once from left to right. A "giflibrary" that is
	 * not inside a complete tag, or a tag without a usable ".gif" file name,
	 * is copied unchanged, so the scan always terminates.
	 *
	 * @param text the caption markup to clean
	 * @return the text with the image tags replaced
	 */
	private String parseGreekCharacterImages(String text) {
		final String marker = "giflibrary";
		final String extension = ".gif";
		StringBuilder out = new StringBuilder(text.length());
		int copied = 0; //everything before this index is already in 'out'
		int hit = text.indexOf(marker);
		while (hit >= 0) {
			int tagStart = text.lastIndexOf('<', hit);
			int tagEnd = text.indexOf('>', hit);
			if (tagStart < copied || tagEnd < 0) {
				//Not inside a complete, not yet handled tag: leave it alone
				hit = text.indexOf(marker, hit + marker.length());
				continue;
			}
			tagEnd++; //make the end exclusive, i.e. include the '>'
			String tag = text.substring(tagStart, tagEnd);
			//Parse the file name for the text we'll replace the tag with
			int nameEnd = tag.indexOf(extension);
			int nameStart = tag.lastIndexOf('/', nameEnd) + 1;

			out.append(text, copied, tagStart);
			if (nameEnd > nameStart) {
				out.append(' ').append(tag, nameStart, nameEnd).append(' ');
			} else {
				out.append(tag);
			}
			copied = tagEnd;
			hit = text.indexOf(marker, copied);
		}
		out.append(text, copied, text.length());
		return out.toString();
	}
}
