package in.ac.iisc.cds.se256.alpha.cc;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;

/**
 * Based on TagCounterMap by Stephen Merity (Smerity)
 * 
 * @author simmhan
 *
 */
public class CCByteCounterMap {
	private static final Logger LOG = Logger.getLogger(CCByteCounterMap.class);

	/**
	 * Hadoop counters maintained by {@link CCByteCounterMapper}.
	 */
	protected static enum MAPPERCOUNTER {
		/** Incremented once for every archive record the mapper starts processing. */
		RECORDS_IN,
		/** Incremented once for every HTTP response whose header contains {@code Content-Type: text/html}. */
		HTML_IN,
		/** Incremented once for every record that failed with an exception or had an unparseable HTTP payload. */
		EXCEPTIONS
	}
	
	/**
	 * Map function that counts the number of bytes of (1) text/html file types, and (2) all file types, that are present in the input.
	 * <p>
	 * For every record it emits {@code ANYFILE_SIZE_BYTES} with the content length taken from the record header.
	 * For every HTTP response record whose HTTP header contains {@code Content-Type: text/html} it additionally
	 * emits {@code HTML_SIZE_BYTES} with the number of raw bytes following the HTTP header.
	 * 
	 * @author simmhan
	 *
	 */
	protected static class CCByteCounterMapper extends Mapper<Text, ArchiveReader, Text, LongWritable> {
		/** Record mimetype that marks an HTTP response (as opposed to a request or metadata record). */
		private static final String HTTP_RESPONSE_MIMETYPE = "application/http; msgtype=response";
		/** Deliberately lax, case-sensitive marker used to recognise HTML responses in the HTTP header. */
		private static final String HTML_CONTENT_TYPE = "Content-Type: text/html";
		/** Blank line (CRLF CRLF) that separates the HTTP header from the body. */
		private static final byte[] HEADER_TERMINATOR = { '\r', '\n', '\r', '\n' };

		// The output keys never change, so they are initialised once instead of on every record.
		private final Text outKeyHtml = new Text("HTML_SIZE_BYTES");
		private final Text outKeyAny = new Text("ANYFILE_SIZE_BYTES");
		
		private final LongWritable outValHtml = new LongWritable(0);
		private final LongWritable outValAny = new LongWritable(0);

		/**
		 * Emits byte counts for every record of one archive.
		 * <p>
		 * A failure on a single record is logged, counted under {@link MAPPERCOUNTER#EXCEPTIONS} and does not
		 * stop processing of the remaining records. An interrupt, however, aborts the whole call.
		 * 
		 * @param key     input key; not used by this mapper
		 * @param value   reader that iterates over the records of one archive
		 * @param context Hadoop context used for counters and output
		 * @throws IOException if the thread is interrupted while writing output
		 */
		@Override
		public void map(Text key, ArchiveReader value, Context context) throws IOException {
			for (ArchiveRecord r : value) {
				try {
					// Avoid building the message (and querying the stream) unless debug logging is on
					if (LOG.isDebugEnabled()) {
						LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
					}

					// process header to get byte count of ANY content
					context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
					outValAny.set(r.getHeader().getContentLength());
					context.write(outKeyAny, outValAny);

					// For the HTML count we're only interested in responses, not requests or metadata.
					// Constant-first equals keeps records without a mimetype from raising a NullPointerException.
					if (!HTTP_RESPONSE_MIMETYPE.equals(r.getHeader().getMimetype())) {
						continue;
					}

					// Convenience function that reads the full message into a raw byte array
					byte[] rawData = IOUtils.toByteArray(r, r.available());

					// Locate the end of the HTTP header in the raw bytes, so that sizes are real byte
					// counts and do not depend on how the payload would decode into characters.
					int headerEnd = indexOfHeaderEnd(rawData);
					if (headerEnd < 0) {
						// Malformed response: still counted under EXCEPTIONS, as it was when this case threw.
						LOG.warn("No HTTP header terminator found in response for " + r.getHeader().getUrl());
						context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
						continue;
					}

					// Only the header is decoded; ISO-8859-1 maps every byte to exactly one char
					String headerText = new String(rawData, 0, headerEnd, StandardCharsets.ISO_8859_1);

					// In our task, we're only interested in text/html, so we can be a little lax
					if (headerText.contains(HTML_CONTENT_TYPE)) {
						context.getCounter(MAPPERCOUNTER.HTML_IN).increment(1);

						// Size of the html body = everything after the blank line that ends the header
						long htmlBodySizeBytes = rawData.length - (headerEnd + HEADER_TERMINATOR.length);
						outValHtml.set(htmlBodySizeBytes);
						context.write(outKeyHtml, outValHtml);
					}
				}
				catch (InterruptedException ie) {
					// Do not swallow the interrupt: restore the flag and stop instead of looping on
					Thread.currentThread().interrupt();
					throw new IOException("Interrupted while writing byte counts", ie);
				}
				catch (Exception ex) {
					LOG.error("Caught Exception", ex);
					context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
				}
			}
		}

		/**
		 * Finds the first CRLF CRLF sequence in {@code data}.
		 * 
		 * @param data raw bytes of an HTTP message
		 * @return index of the first byte of the terminator, or {@code -1} if there is none
		 */
		private static int indexOfHeaderEnd(byte[] data) {
			int lastStart = data.length - HEADER_TERMINATOR.length;
			for (int i = 0; i <= lastStart; i++) {
				boolean matches = true;
				for (int j = 0; j < HEADER_TERMINATOR.length; j++) {
					if (data[i + j] != HEADER_TERMINATOR[j]) {
						matches = false;
						break;
					}
				}
				if (matches) {
					return i;
				}
			}
			return -1;
		}
	}
}
