linux/fs/btrfs/zlib.c
Daniel Vacek da798fa519 btrfs: zstd: enable negative compression levels mount option
Allow using the fast modes (negative compression levels) of zstd as a
mount option.
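
For example, with a placeholder device and mount point:

  mount -o compress=zstd:-5 /dev/sdX /mnt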

As the results below show, the compression ratio is (expectedly) lower:

for level in {-15..-1} 1 2 3; \
do printf "level %3d\n" $level; \
  mount -o compress=zstd:$level /dev/sdb /mnt/test/; \
  grep sdb /proc/mounts; \
  cp -r /usr/bin       /mnt/test/; sync; compsize /mnt/test/bin; \
  cp -r /usr/share/doc /mnt/test/; sync; compsize /mnt/test/doc; \
  cp    enwik9         /mnt/test/; sync; compsize /mnt/test/enwik9; \
  cp    linux-6.13.tar /mnt/test/; sync; compsize /mnt/test/linux-6.13.tar; \
  rm -r /mnt/test/{bin,doc,enwik9,linux-6.13.tar}; \
  umount /mnt/test/; \
done |& tee results | \
awk '/^level/{print}/^TOTAL/{print$3"\t"$2"  |"}' | paste - - - - -

		266M	bin  |	45M	doc  |	953M	wiki |	1.4G	source
=============================+===============+===============+===============+
level -15	180M	67%  |	30M	68%  |	694M	72%  |	598M	40%  |
level -14	180M	67%  |	30M	67%  |	683M	71%  |	581M	39%  |
level -13	177M	66%  |	29M	66%  |	671M	70%  |	566M	38%  |
level -12	174M	65%  |	29M	65%  |	658M	69%  |	548M	37%  |
level -11	174M	65%  |	28M	64%  |	645M	67%  |	530M	35%  |
level -10	171M	64%  |	28M	62%  |	631M	66%  |	512M	34%  |
level  -9	165M	62%  |	27M	61%  |	615M	64%  |	493M	33%  |
level  -8	161M	60%  |	27M	59%  |	598M	62%  |	475M	32%  |
level  -7	155M	58%  |	26M	58%  |	582M	61%  |	457M	30%  |
level  -6	151M	56%  |	25M	56%  |	565M	59%  |	437M	29%  |
level  -5	145M	54%  |	24M	55%  |	545M	57%  |	417M	28%  |
level  -4	139M	52%  |	23M	52%  |	520M	54%  |	391M	26%  |
level  -3	135M	50%  |	22M	50%  |	495M	51%  |	369M	24%  |
level  -2	127M	47%  |	22M	48%  |	470M	49%  |	349M	23%  |
level  -1	120M	45%  |	21M	47%  |	452M	47%  |	332M	22%  |
level   1	110M	41%  |	17M	39%  |	362M	38%  |	290M	19%  |
level   2	106M	40%  |	17M	38%  |	349M	36%  |	288M	19%  |
level   3	104M	39%  |	16M	37%  |	340M	35%  |	276M	18%  |

The samples represent data sets that can be commonly found and show
their approximate compressibility. The fast levels trade compression
ratio for speed and are best suited for highly compressible data.

As can be seen above, compared to the current default zstd level 3, the
ratio at level -15 is roughly 2x worse, and it improves almost linearly
with each level up to -1.

Signed-off-by: Daniel Vacek <neelx@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-03-18 20:35:41 +01:00


// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 *
 * Based on jffs2 zlib code:
 * Copyright © 2001-2007 Red Hat, Inc.
 * Created by David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/refcount.h>
#include "btrfs_inode.h"
#include "compression.h"
#include "fs.h"
#include "subpage.h"

/* workspace buffer size for s390 zlib hardware support */
#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)

struct workspace {
	z_stream strm;
	char *buf;
	unsigned int buf_size;
	struct list_head list;
	int level;
};

static struct workspace_manager wsm;

struct list_head *zlib_get_workspace(unsigned int level)
{
	struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
	struct workspace *workspace = list_entry(ws, struct workspace, list);

	workspace->level = level;

	return ws;
}

void zlib_free_workspace(struct list_head *ws)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);

	kvfree(workspace->strm.workspace);
	kfree(workspace->buf);
	kfree(workspace);
}

struct list_head *zlib_alloc_workspace(unsigned int level)
{
	struct workspace *workspace;
	int workspacesize;

	workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
	if (!workspace)
		return ERR_PTR(-ENOMEM);

	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			    zlib_inflate_workspacesize());
	workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
	workspace->level = level;
	workspace->buf = NULL;
	/*
	 * In case of s390 zlib hardware support, allocate a larger workspace
	 * buffer. If the allocation fails, fall back to a single page buffer.
	 */
	if (zlib_deflate_dfltcc_enabled()) {
		workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
					 __GFP_NOMEMALLOC | __GFP_NORETRY |
					 __GFP_NOWARN | GFP_NOIO);
		workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
	}
	if (!workspace->buf) {
		workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		workspace->buf_size = PAGE_SIZE;
	}
	if (!workspace->strm.workspace || !workspace->buf)
		goto fail;

	INIT_LIST_HEAD(&workspace->list);

	return &workspace->list;
fail:
	zlib_free_workspace(&workspace->list);
	return ERR_PTR(-ENOMEM);
}

/*
 * Helper for S390x with hardware zlib compression support.
 *
 * That hardware acceleration requires a buffer size larger than a single page
 * to get ideal performance, thus we need to do the memory copy rather than
 * use the page cache directly as input buffer.
 */
static int copy_data_into_buffer(struct address_space *mapping,
				 struct workspace *workspace, u64 filepos,
				 unsigned long length)
{
	u64 cur = filepos;

	/* It's only for hardware accelerated zlib code. */
	ASSERT(zlib_deflate_dfltcc_enabled());

	while (cur < filepos + length) {
		struct folio *folio;
		void *data_in;
		unsigned int offset;
		unsigned long copy_length;
		int ret;

		ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
		if (ret < 0)
			return ret;
		/* No large folio support yet. */
		ASSERT(!folio_test_large(folio));

		offset = offset_in_folio(folio, cur);
		copy_length = min(folio_size(folio) - offset,
				  filepos + length - cur);

		data_in = kmap_local_folio(folio, offset);
		memcpy(workspace->buf + cur - filepos, data_in, copy_length);
		kunmap_local(data_in);
		cur += copy_length;
	}
	return 0;
}

int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
			 u64 start, struct folio **folios, unsigned long *out_folios,
			 unsigned long *total_in, unsigned long *total_out)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret;
	char *data_in = NULL;
	char *cfolio_out;
	int nr_folios = 0;
	struct folio *in_folio = NULL;
	struct folio *out_folio = NULL;
	unsigned long len = *total_out;
	unsigned long nr_dest_folios = *out_folios;
	const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
	const u64 orig_end = start + len;

	*out_folios = 0;
	*total_out = 0;
	*total_in = 0;

	ret = zlib_deflateInit(&workspace->strm, workspace->level);
	if (unlikely(ret != Z_OK)) {
		struct btrfs_inode *inode = BTRFS_I(mapping->host);

		btrfs_err(inode->root->fs_info,
			  "zlib compression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
		ret = -EIO;
		goto out;
	}

	workspace->strm.total_in = 0;
	workspace->strm.total_out = 0;

	out_folio = btrfs_alloc_compr_folio();
	if (out_folio == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	cfolio_out = folio_address(out_folio);
	folios[0] = out_folio;
	nr_folios = 1;

	workspace->strm.next_in = workspace->buf;
	workspace->strm.avail_in = 0;
	workspace->strm.next_out = cfolio_out;
	workspace->strm.avail_out = PAGE_SIZE;
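
	/*
	 * Main deflate loop: refill the input in buf_size chunks (one page
	 * at a time unless s390 DFLTCC hardware support is enabled) and
	 * switch to a fresh output folio whenever the current one fills up.
	 */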
	while (workspace->strm.total_in < len) {
		/*
		 * Get next input pages and copy the contents to
		 * the workspace buffer if required.
		 */
		if (workspace->strm.avail_in == 0) {
			unsigned long bytes_left = len - workspace->strm.total_in;
			unsigned int copy_length = min(bytes_left, workspace->buf_size);

			/*
			 * This can only happen when hardware zlib compression is
			 * enabled.
			 */
			if (copy_length > PAGE_SIZE) {
				ret = copy_data_into_buffer(mapping, workspace,
							    start, copy_length);
				if (ret < 0)
					goto out;
				start += copy_length;
				workspace->strm.next_in = workspace->buf;
				workspace->strm.avail_in = copy_length;
			} else {
				unsigned int pg_off;
				unsigned int cur_len;

				if (data_in) {
					kunmap_local(data_in);
					folio_put(in_folio);
					data_in = NULL;
				}
				ret = btrfs_compress_filemap_get_folio(mapping,
								       start, &in_folio);
				if (ret < 0)
					goto out;
				pg_off = offset_in_page(start);
				cur_len = btrfs_calc_input_length(orig_end, start);
				data_in = kmap_local_folio(in_folio, pg_off);
				start += cur_len;
				workspace->strm.next_in = data_in;
				workspace->strm.avail_in = cur_len;
			}
		}

		ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
		if (unlikely(ret != Z_OK)) {
			struct btrfs_inode *inode = BTRFS_I(mapping->host);

			btrfs_warn(inode->root->fs_info,
				   "zlib compression failed, error %d root %llu inode %llu offset %llu",
				   ret, btrfs_root_id(inode->root), btrfs_ino(inode),
				   start);
			zlib_deflateEnd(&workspace->strm);
			ret = -EIO;
			goto out;
		}

		/* We're making it bigger, give up. */
		if (workspace->strm.total_in > 8192 &&
		    workspace->strm.total_in < workspace->strm.total_out) {
			ret = -E2BIG;
			goto out;
		}
		/*
		 * We need another page for writing out. Test this before the
		 * total_in so we will pull in a new page for the stream end
		 * if required.
		 */
		if (workspace->strm.avail_out == 0) {
			if (nr_folios == nr_dest_folios) {
				ret = -E2BIG;
				goto out;
			}
			out_folio = btrfs_alloc_compr_folio();
			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
			cfolio_out = folio_address(out_folio);
			folios[nr_folios] = out_folio;
			nr_folios++;
			workspace->strm.avail_out = PAGE_SIZE;
			workspace->strm.next_out = cfolio_out;
		}
		/* We're all done. */
		if (workspace->strm.total_in >= len)
			break;
		if (workspace->strm.total_out > max_out)
			break;
	}
	workspace->strm.avail_in = 0;
	/*
	 * Call deflate with Z_FINISH flush parameter providing more output
	 * space but no more input data, until it returns with Z_STREAM_END.
	 */
	while (ret != Z_STREAM_END) {
		ret = zlib_deflate(&workspace->strm, Z_FINISH);
		if (ret == Z_STREAM_END)
			break;
		if (ret != Z_OK && ret != Z_BUF_ERROR) {
			zlib_deflateEnd(&workspace->strm);
			ret = -EIO;
			goto out;
		} else if (workspace->strm.avail_out == 0) {
			/* Get another folio for the stream end. */
			if (nr_folios == nr_dest_folios) {
				ret = -E2BIG;
				goto out;
			}
			out_folio = btrfs_alloc_compr_folio();
			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
			cfolio_out = folio_address(out_folio);
			folios[nr_folios] = out_folio;
			nr_folios++;
			workspace->strm.avail_out = PAGE_SIZE;
			workspace->strm.next_out = cfolio_out;
		}
	}
	zlib_deflateEnd(&workspace->strm);

	if (workspace->strm.total_out >= workspace->strm.total_in) {
		ret = -E2BIG;
		goto out;
	}

	ret = 0;
	*total_out = workspace->strm.total_out;
	*total_in = workspace->strm.total_in;
out:
	*out_folios = nr_folios;
	if (data_in) {
		kunmap_local(data_in);
		folio_put(in_folio);
	}

	return ret;
}

int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret = 0, ret2;
	int wbits = MAX_WBITS;
	char *data_in;
	size_t total_out = 0;
	unsigned long folio_in_index = 0;
	size_t srclen = cb->compressed_len;
	unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
	unsigned long buf_start;
	struct folio **folios_in = cb->compressed_folios;

	data_in = kmap_local_folio(folios_in[folio_in_index], 0);
	workspace->strm.next_in = data_in;
	workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
	workspace->strm.total_in = 0;

	workspace->strm.total_out = 0;
	workspace->strm.next_out = workspace->buf;
	workspace->strm.avail_out = workspace->buf_size;

	/*
	 * If it's deflate, and it's got no preset dictionary, then we can
	 * tell zlib to skip the adler32 check: a valid zlib header stores
	 * the method in the low nibble of the first byte and, read as a
	 * big-endian 16-bit value, is divisible by 31. Negative wbits
	 * selects raw deflate with the window size taken from the high
	 * nibble.
	 */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {
		wbits = -((data_in[0] >> 4) + 8);
		workspace->strm.next_in += 2;
		workspace->strm.avail_in -= 2;
	}

	ret = zlib_inflateInit2(&workspace->strm, wbits);
	if (unlikely(ret != Z_OK)) {
		struct btrfs_inode *inode = cb->bbio.inode;

		kunmap_local(data_in);
		btrfs_err(inode->root->fs_info,
			  "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
		return -EIO;
	}
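
	/*
	 * Inflate until all compressed input is consumed. Each decompressed
	 * chunk is copied from the workspace buffer into the destination
	 * pages, advancing to the next input folio when the current one is
	 * exhausted.
	 */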
	while (workspace->strm.total_in < srclen) {
		ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		buf_start = total_out;
		total_out = workspace->strm.total_out;

		/* We didn't make progress in this inflate call, we're done. */
		if (buf_start == total_out)
			break;

		ret2 = btrfs_decompress_buf2page(workspace->buf,
						 total_out - buf_start, cb, buf_start);
		if (ret2 == 0) {
			ret = 0;
			goto done;
		}

		workspace->strm.next_out = workspace->buf;
		workspace->strm.avail_out = workspace->buf_size;

		if (workspace->strm.avail_in == 0) {
			unsigned long tmp;

			kunmap_local(data_in);
			folio_in_index++;
			if (folio_in_index >= total_folios_in) {
				data_in = NULL;
				break;
			}
			data_in = kmap_local_folio(folios_in[folio_in_index], 0);
			workspace->strm.next_in = data_in;
			tmp = srclen - workspace->strm.total_in;
			workspace->strm.avail_in = min(tmp, PAGE_SIZE);
		}
	}
	if (unlikely(ret != Z_STREAM_END)) {
		btrfs_err(cb->bbio.inode->root->fs_info,
			  "zlib decompression failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(cb->bbio.inode->root),
			  btrfs_ino(cb->bbio.inode), cb->start);
		ret = -EIO;
	} else {
		ret = 0;
	}
done:
	zlib_inflateEnd(&workspace->strm);
	if (data_in)
		kunmap_local(data_in);
	return ret;
}

int zlib_decompress(struct list_head *ws, const u8 *data_in,
		struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
		size_t destlen)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret = 0;
	int wbits = MAX_WBITS;
	unsigned long to_copy;

	workspace->strm.next_in = data_in;
	workspace->strm.avail_in = srclen;
	workspace->strm.total_in = 0;

	workspace->strm.next_out = workspace->buf;
	workspace->strm.avail_out = workspace->buf_size;
	workspace->strm.total_out = 0;

	/*
	 * If it's deflate, and it's got no preset dictionary, then we can
	 * tell zlib to skip the adler32 check.
	 */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {
		wbits = -((data_in[0] >> 4) + 8);
		workspace->strm.next_in += 2;
		workspace->strm.avail_in -= 2;
	}

	ret = zlib_inflateInit2(&workspace->strm, wbits);
	if (unlikely(ret != Z_OK)) {
		struct btrfs_inode *inode = folio_to_inode(dest_folio);

		btrfs_err(inode->root->fs_info,
			  "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode),
			  folio_pos(dest_folio));
		return -EIO;
	}

	/*
	 * Everything (in/out buf) should be at most one sector, there should
	 * be no need to switch any input/output buffer.
	 */
	ret = zlib_inflate(&workspace->strm, Z_FINISH);
	to_copy = min(workspace->strm.total_out, destlen);
	if (ret != Z_STREAM_END)
		goto out;

	memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy);

out:
	if (unlikely(to_copy != destlen)) {
		struct btrfs_inode *inode = folio_to_inode(dest_folio);

		btrfs_err(inode->root->fs_info,
			  "zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode),
			  folio_pos(dest_folio), to_copy, destlen);
		ret = -EIO;
	} else {
		ret = 0;
	}

	zlib_inflateEnd(&workspace->strm);

	if (unlikely(to_copy < destlen))
		folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);

	return ret;
}

const struct btrfs_compress_op btrfs_zlib_compress = {
	.workspace_manager	= &wsm,
	.min_level		= 1,
	.max_level		= 9,
	.default_level		= BTRFS_ZLIB_DEFAULT_LEVEL,
};