-- Sums the occurrences of every gram in the ngrams source table for rows with
-- year >= 1910 and writes one (gram, total) row per gram to the s3 output table.

-- Session settings: compress job output with GzipCodec.
set mapred.output.compress=true;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;

-- Session settings: HiveInputFormat as base input format and a minimum split
-- size of 134217728 bytes (128 MiB).
set hive.base.inputformat=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set mapred.min.split.size=134217728;

-- Remove table definitions left over from a previous run.
DROP TABLE ngrams;
DROP TABLE s3;

-- Source table: tab-delimited SEQUENCEFILE data under the 3gram dataset path.
CREATE EXTERNAL TABLE ngrams (
    gram        STRING,
    year        INT,
    occurrences BIGINT,
    pages       BIGINT,
    books       BIGINT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE
LOCATION 's3://datasets.elasticmapreduce/ngrams/books/20090715/eng-all/3gram/';

-- Output table: tab-delimited rows of (ngram, occurrences) at the output path.
CREATE EXTERNAL TABLE s3 (
    ngram       STRING,
    occurrences BIGINT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
LOCATION 's3://danielnaber/output/';

-- Replace the contents of the output table with the per-gram totals.
INSERT OVERWRITE TABLE s3
SELECT
    gram,
    SUM(occurrences)
FROM ngrams
WHERE year >= 1910
GROUP BY gram;