summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Peter Wu <peter@lekensteyn.nl> 2014-05-03 16:43:47 +0200
committer Peter Wu <peter@lekensteyn.nl> 2014-05-03 16:43:47 +0200
commit 0d5a34222de4c00be858bde7eaae4163db4bcc3e (patch)
tree 4c6cff13f0e84d251a6cf5e573cbf4c0f10edc05
parent 94115ee02948e4b70220025298c812ffb2eee23b (diff)
download TwitterDataAnalytics-0d5a34222de4c00be858bde7eaae4163db4bcc3e.tar.gz
CompressableDataWriter: use separate .gz-suffixed file
Do not use the same name for compressed and uncompressed files; it is confusing and non-standard. The writer now uses a ".gz"-suffixed file and, if conversion is requested, appends the contents of an existing uncompressed file to it and then removes the uncompressed original.
-rw-r--r-- src/io/CompressableDataWriter.java 163
1 files changed, 72 insertions, 91 deletions
diff --git a/src/io/CompressableDataWriter.java b/src/io/CompressableDataWriter.java
index d81d80b..84b2150 100644
--- a/src/io/CompressableDataWriter.java
+++ b/src/io/CompressableDataWriter.java
@@ -7,10 +7,6 @@ import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.StandardCopyOption;
-import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@@ -24,16 +20,28 @@ import org.apache.commons.io.IOUtils;
public class CompressableDataWriter extends DataWriter {
/**
- * If true, then the output will always be a gzip-compressed stream
- * (possibly converting the input). Otherwise, if the input was not
- * compressed. then neither will the output be compressed.
+ * If true, then any uncompressed files will be converted to the compressed
+ * one, removing the uncompressed original.
*/
- private final boolean convertUncompressed;
+ private final boolean convertAndRemoveUncompressed;
+ /**
+ * A DataWriter that writes compressed files. Compressed files will get a
+ * ".gz" extension. If both the compressed and uncompressed files exist and
+ * conversion is requested, then the contents of the uncompressed file is
+ * written to the compressed file. After successful conversion, the
+ * uncompressed file is removed.
+ *
+ * @param profilesName The base filename to store user profiles.
+ * @param tweetsName The base filename to store tweets.
+ * @param convertAndRemoveUncompressed True if non-empty uncompressed files
+ * should be converted to compressed ones.
+ * @throws IOException
+ */
public CompressableDataWriter(String profilesName, String tweetsName,
- boolean convertUncompressed) throws IOException {
+ boolean convertAndRemoveUncompressed) throws IOException {
super(profilesName, tweetsName);
- this.convertUncompressed = convertUncompressed;
+ this.convertAndRemoveUncompressed = convertAndRemoveUncompressed;
}
@Override
@@ -43,103 +51,75 @@ public class CompressableDataWriter extends DataWriter {
class CompressedStore extends Store {
- private boolean compressable = false;
+ private static final String FILE_EXT = ".gz";
CompressedStore(String filename) {
super(filename);
}
+ private String getFileNameGz() {
+ String filename = getFileName();
+ if (!getFileName().endsWith(FILE_EXT)) {
+ filename += FILE_EXT;
+ }
+ return filename;
+ }
+
@Override
public void open() throws IOException {
- BufferedInputStream bis = null;
+ // throws FileNotFoundException if the dirs do not exist...
+ os = new FileOutputStream(getFileNameGz(), true);
try {
- bis = new BufferedInputStream(new FileInputStream(getFileName()));
- // file found containing magic? OK, gzip writable!
- compressable = isCompressed(bis);
- } catch (FileNotFoundException ex) {
- // file not found? Then we are free to write.
- compressable = true;
- } finally {
- IOUtils.closeQuietly(bis);
- }
- if (!compressable) {
- if (CompressableDataWriter.this.convertUncompressed) {
- tryConvertToGzip();
- } else {
- getLogger().info(getFileName() + ": not compressed and "
- + "won't be compressed either.");
- }
- }
- // now prepare a compressed output stream if possible...
- os = new FileOutputStream(getFileName(), true);
- if (compressable) {
os = new GZIPOutputStream(os);
- }
- }
- protected boolean isCompressed(BufferedInputStream bis) {
- try {
- // file can be opened, check for GZIP magic to see whether it
- // is compressed or not
- byte[] header = new byte[2];
- if (bis.read(header, 0, header.length) == 2) {
- int magic = ((header[1] & 0xFF) << 8) | (header[0] & 0xFF);
- return magic == GZIPInputStream.GZIP_MAGIC;
+ // try to convert uncompressed files if necessary
+ if (convertAndRemoveUncompressed) {
+ if (!getFileName().equals(getFileNameGz())) {
+ doConvertUncompressed(getFileName(), getFileNameGz());
+ }
}
- // file is too small, it is likely empty or contains a single
- // newline or other junk.
- return true;
} catch (IOException ex) {
- // file is too small, do not overwrite.
+ IOUtils.closeQuietly(os);
+ os = null;
+ throw ex;
}
- return false;
}
- private void tryConvertToGzip() {
- // How to convert:
- // 1. Rename old file
- // 2. Create new compressed file in same dir
- // 3. rename new file to old
- // 4. if (3) fails, delete old file and rename again
- File newTmpFile = null;
- FileInputStream fis = null;
- OutputStream gzOs = null;
+ private void doConvertUncompressed(String filename, String filenameGz)
+ throws IOException {
+ BufferedInputStream bis = null;
try {
- File origFile = new File(getFileName());
- long origSize = origFile.length();
- fis = new FileInputStream(origFile);
- newTmpFile = File.createTempFile(getFileName(), "origFile",
- origFile.getParentFile());
- gzOs = new GZIPOutputStream(new FileOutputStream(newTmpFile));
-
- // now compress data, hopefully we have enough time and disk
- IOUtils.copy(fis, gzOs);
- gzOs.close();
- fis.close();
- long newSize = newTmpFile.length();
-
- // compression ready! Don't delete the result!
- Files.move(newTmpFile.toPath(), origFile.toPath(),
- StandardCopyOption.REPLACE_EXISTING);
- newTmpFile = null;
- compressable = true;
- // show some compression stats
- printStats(origSize, newSize);
- } catch (IOException ex) {
- // failed to convert (disk space too low? Not compressed?)
- getLogger().log(Level.INFO, getFileName() + ": Unable to "
- + "convert an uncompressed file", ex);
- } finally {
- IOUtils.closeQuietly(fis);
- IOUtils.closeQuietly(gzOs);
- if (newTmpFile != null) {
- newTmpFile.delete();
+ // original, uncompressed file.
+ File bigFile = new File(filename);
+ bis = new BufferedInputStream(new FileInputStream(bigFile));
+ long origSize = bigFile.length();
+ long origSizeGz = new File(filenameGz).length();
+
+ // start converting the uncompressed file
+ IOUtils.copy(bis, os);
+ // flush all data to file to have a more correct size
+ ((GZIPOutputStream) os).finish();
+ os.flush();
+
+ // done, now try to remove the big blob and print some stats.
+ IOUtils.closeQuietly(bis);
+ if (!bigFile.delete()) {
+ getLogger().info(filename + ": converted file, "
+ + "but cannot remove the old, uncompressed file.");
}
+ // calculate effective size of newly compressed part
+ long newSize = new File(filenameGz).length();
+ newSize -= origSizeGz;
+ printStats(filename, origSize, newSize);
+ } catch (FileNotFoundException ex) {
+ // file not found? No problem, nothing to convert then.
+ } finally {
+ IOUtils.closeQuietly(bis);
}
}
- private void printStats(long origSize, long newSize) {
- String msg = "Succesfully converted " + getFileName() + ". ";
+ private void printStats(String filename, long origSize, long newSize) {
+ String msg = "Succesfully converted " + filename + ". ";
msg += "Uncompressed size: " + origSize + "; "
+ "compressed size: " + newSize + ".";
if (newSize > 0 && origSize > 0) {
@@ -158,11 +138,12 @@ public class CompressableDataWriter extends DataWriter {
@Override
public InputStream getInputStream() throws IOException {
- FileInputStream fis = new FileInputStream(getFileName());
- if (compressable) {
+ FileInputStream fis = new FileInputStream(getFileNameGz());
+ try {
return new GZIPInputStream(fis);
- } else {
- return fis;
+ } catch (IOException ex) {
+ IOUtils.closeQuietly(fis);
+ throw ex;
}
}
}