From 0d5a34222de4c00be858bde7eaae4163db4bcc3e Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sat, 3 May 2014 16:43:47 +0200 Subject: CompressableDataWriter: use separate .gz-suffixed file Do not use the same name for compressed and uncompressed files; it is confusing and non-standard. Now it'll use a ".gz"-suffixed file, appending the contents of the uncompressed file to it if conversion is requested. --- src/io/CompressableDataWriter.java | 163 ++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 91 deletions(-) (limited to 'src') diff --git a/src/io/CompressableDataWriter.java b/src/io/CompressableDataWriter.java index d81d80b..84b2150 100644 --- a/src/io/CompressableDataWriter.java +++ b/src/io/CompressableDataWriter.java @@ -7,10 +7,6 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -24,16 +20,28 @@ import org.apache.commons.io.IOUtils; public class CompressableDataWriter extends DataWriter { /** - * If true, then the output will always be a gzip-compressed stream - * (possibly converting the input). Otherwise, if the input was not - * compressed. then neither will the output be compressed. + * If true, then any uncompressed files will be converted to the compressed + * one, removing the uncompressed original. */ - private final boolean convertUncompressed; + private final boolean convertAndRemoveUncompressed; + /** + * A DataWriter that writes compressed files. Compressed files will get a + * ".gz" extension. If both the compressed and uncompressed files exist and + * conversion is requested, then the contents of the uncompressed file is + * written to the compressed file. After successful conversion, the + * uncompressed file is removed.
+ * + * @param profilesName The base filename to store user profiles. + * @param tweetsName The base filename to store tweets. + * @param convertAndRemoveUncompressed True if non-empty uncompressed files + * should be converted to compressed ones. + * @throws IOException + */ public CompressableDataWriter(String profilesName, String tweetsName, - boolean convertUncompressed) throws IOException { + boolean convertAndRemoveUncompressed) throws IOException { super(profilesName, tweetsName); - this.convertUncompressed = convertUncompressed; + this.convertAndRemoveUncompressed = convertAndRemoveUncompressed; } @Override @@ -43,103 +51,75 @@ public class CompressableDataWriter extends DataWriter { class CompressedStore extends Store { - private boolean compressable = false; + private static final String FILE_EXT = ".gz"; CompressedStore(String filename) { super(filename); } + private String getFileNameGz() { + String filename = getFileName(); + if (!getFileName().endsWith(FILE_EXT)) { + filename += FILE_EXT; + } + return filename; + } + @Override public void open() throws IOException { - BufferedInputStream bis = null; + // throws FileNotFoundException if the dirs do not exist... + os = new FileOutputStream(getFileNameGz(), true); try { - bis = new BufferedInputStream(new FileInputStream(getFileName())); - // file found containing magic? OK, gzip writable! - compressable = isCompressed(bis); - } catch (FileNotFoundException ex) { - // file not found? Then we are free to write. - compressable = true; - } finally { - IOUtils.closeQuietly(bis); - } - if (!compressable) { - if (CompressableDataWriter.this.convertUncompressed) { - tryConvertToGzip(); - } else { - getLogger().info(getFileName() + ": not compressed and " - + "won't be compressed either."); - } - } - // now prepare a compressed output stream if possible... 
- os = new FileOutputStream(getFileName(), true); - if (compressable) { os = new GZIPOutputStream(os); - } - } - protected boolean isCompressed(BufferedInputStream bis) { - try { - // file can be opened, check for GZIP magic to see whether it - // is compressed or not - byte[] header = new byte[2]; - if (bis.read(header, 0, header.length) == 2) { - int magic = ((header[1] & 0xFF) << 8) | (header[0] & 0xFF); - return magic == GZIPInputStream.GZIP_MAGIC; + // try to convert uncompressed files if necessary + if (convertAndRemoveUncompressed) { + if (!getFileName().equals(getFileNameGz())) { + doConvertUncompressed(getFileName(), getFileNameGz()); + } } - // file is too small, it is likely empty or contains a single - // newline or other junk. - return true; } catch (IOException ex) { - // file is too small, do not overwrite. + IOUtils.closeQuietly(os); + os = null; + throw ex; } - return false; } - private void tryConvertToGzip() { - // How to convert: - // 1. Rename old file - // 2. Create new compressed file in same dir - // 3. rename new file to old - // 4. if (3) fails, delete old file and rename again - File newTmpFile = null; - FileInputStream fis = null; - OutputStream gzOs = null; + private void doConvertUncompressed(String filename, String filenameGz) + throws IOException { + BufferedInputStream bis = null; try { - File origFile = new File(getFileName()); - long origSize = origFile.length(); - fis = new FileInputStream(origFile); - newTmpFile = File.createTempFile(getFileName(), "origFile", - origFile.getParentFile()); - gzOs = new GZIPOutputStream(new FileOutputStream(newTmpFile)); - - // now compress data, hopefully we have enough time and disk - IOUtils.copy(fis, gzOs); - gzOs.close(); - fis.close(); - long newSize = newTmpFile.length(); - - // compression ready! Don't delete the result! 
- Files.move(newTmpFile.toPath(), origFile.toPath(), - StandardCopyOption.REPLACE_EXISTING); - newTmpFile = null; - compressable = true; - // show some compression stats - printStats(origSize, newSize); - } catch (IOException ex) { - // failed to convert (disk space too low? Not compressed?) - getLogger().log(Level.INFO, getFileName() + ": Unable to " - + "convert an uncompressed file", ex); - } finally { - IOUtils.closeQuietly(fis); - IOUtils.closeQuietly(gzOs); - if (newTmpFile != null) { - newTmpFile.delete(); + // original, uncompressed file. + File bigFile = new File(filename); + bis = new BufferedInputStream(new FileInputStream(bigFile)); + long origSize = bigFile.length(); + long origSizeGz = new File(filenameGz).length(); + + // start converting the uncompressed file + IOUtils.copy(bis, os); + // flush all data to file to have a more correct size + ((GZIPOutputStream) os).finish(); + os.flush(); + + // done, now try to remove the big blob and print some stats. + IOUtils.closeQuietly(bis); + if (!bigFile.delete()) { + getLogger().info(filename + ": converted file, " + + "but cannot remove the old, uncompressed file."); } + // calculate effective size of newly compressed part + long newSize = new File(filenameGz).length(); + newSize -= origSizeGz; + printStats(filename, origSize, newSize); + } catch (FileNotFoundException ex) { + // file not found? No problem, nothing to convert then. + } finally { + IOUtils.closeQuietly(bis); } } - private void printStats(long origSize, long newSize) { - String msg = "Succesfully converted " + getFileName() + ". "; + private void printStats(String filename, long origSize, long newSize) { + String msg = "Succesfully converted " + filename + ". 
"; msg += "Uncompressed size: " + origSize + "; " + "compressed size: " + newSize + "."; if (newSize > 0 && origSize > 0) { @@ -158,11 +138,12 @@ public class CompressableDataWriter extends DataWriter { @Override public InputStream getInputStream() throws IOException { - FileInputStream fis = new FileInputStream(getFileName()); - if (compressable) { + FileInputStream fis = new FileInputStream(getFileNameGz()); + try { return new GZIPInputStream(fis); - } else { - return fis; + } catch (IOException ex) { + IOUtils.closeQuietly(fis); + throw ex; } } } -- cgit v1.2.1