Add support for expiring files
SUPPLEMENTALLY: - Add an `expiration` field to the `file` table of the database - Produce a migration for the above change - Overhaul the cleanup script, and integrate into fhost.py (now run using FLASK_APP=fhost flask prune) - Replace the old cleanup script with a deprecation notice - Add information about how to expire files to the index - Update the README with information about the new script Squashed commits: Add a note explaining that expired files aren't immediately removed Show correct times on the index page graph Improve the migration script, removing the need for --legacy Use automap in place of an explicit file map in migration Remove vestigial `touch()` Don't crash when upgrading a fresh database Remove vestigial warning about legacy files More efficiently filter to unexpired files when migrating https://git.0x0.st/mia/0x0/pulls/72#issuecomment-224 Coalesce updates to the database during migration https://git.0x0.st/mia/0x0/pulls/72#issuecomment-226 Remove vestigial database model https://git.0x0.st/mia/0x0/pulls/72#issuecomment-261 prune: Stream expired files from the database (as opposed to collecting them all first) config.example.py: Add min & max expiration + description
This commit is contained in:
parent
00dba0e189
commit
af4b3b06c0
6 changed files with 269 additions and 57 deletions
|
@ -35,8 +35,8 @@ downsides, one of them being that range requests will not work. This is a
|
|||
problem for example when streaming media files: It won’t be possible to seek,
|
||||
and some ISOBMFF (MP4) files will not play at all.
|
||||
|
||||
To make files expire, simply create a cronjob that runs ``cleanup.py`` every
|
||||
now and then.
|
||||
To make files expire, simply create a cronjob that runs ``FLASK_APP=fhost
|
||||
flask prune`` every now and then.
|
||||
|
||||
Before running the service for the first time, run ``FLASK_APP=fhost flask db upgrade``.
|
||||
|
||||
|
|
48
cleanup.py
48
cleanup.py
|
@ -1,44 +1,8 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Copyright © 2020 Mia Herkt
|
||||
Licensed under the EUPL, Version 1.2 or - as soon as approved
|
||||
by the European Commission - subsequent versions of the EUPL
|
||||
(the "License");
|
||||
You may not use this work except in compliance with the License.
|
||||
You may obtain a copy of the license at:
|
||||
|
||||
https://joinup.ec.europa.eu/software/page/eupl
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
either express or implied.
|
||||
See the License for the specific language governing permissions
|
||||
and limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import datetime
|
||||
from fhost import app
|
||||
|
||||
os.chdir(os.path.dirname(sys.argv[0]))
|
||||
os.chdir(app.config["FHOST_STORAGE_PATH"])
|
||||
|
||||
files = [f for f in os.listdir(".")]
|
||||
|
||||
maxs = app.config["MAX_CONTENT_LENGTH"]
|
||||
mind = 30
|
||||
maxd = 365
|
||||
|
||||
for f in files:
|
||||
stat = os.stat(f)
|
||||
systime = time.time()
|
||||
age = datetime.timedelta(seconds=(systime - stat.st_mtime)).days
|
||||
|
||||
maxage = mind + (-maxd + mind) * (stat.st_size / maxs - 1) ** 3
|
||||
|
||||
if age >= maxage:
|
||||
os.remove(f)
|
||||
# This script is kept only as a deprecation notice: it tells the operator which
# command replaces it and then exits with a non-zero status so that cronjobs
# still pointing at it fail visibly.
print("This script has been replaced!!")
print("Instead, please run")
print("")
print(" $ FLASK_APP=fhost flask prune")
print("")

# `exit()` is a convenience helper injected by the `site` module and is not
# guaranteed to exist (e.g. under `python -S`); raising SystemExit always works.
raise SystemExit(1)
|
||||
|
|
165
fhost.py
165
fhost.py
|
@ -22,12 +22,17 @@
|
|||
from flask import Flask, abort, make_response, redirect, request, send_from_directory, url_for, Response, render_template
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_migrate import Migrate
|
||||
from sqlalchemy import and_
|
||||
from jinja2.exceptions import *
|
||||
from jinja2 import ChoiceLoader, FileSystemLoader
|
||||
from hashlib import sha256
|
||||
from magic import Magic
|
||||
from mimetypes import guess_extension
|
||||
import click
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import typing
|
||||
import requests
|
||||
from validators import url as url_valid
|
||||
from pathlib import Path
|
||||
|
@ -121,12 +126,14 @@ class File(db.Model):
|
|||
addr = db.Column(db.UnicodeText)
|
||||
removed = db.Column(db.Boolean, default=False)
|
||||
nsfw_score = db.Column(db.Float)
|
||||
expiration = db.Column(db.BigInteger)
|
||||
|
||||
def __init__(self, sha256, ext, mime, addr):
|
||||
def __init__(self, sha256, ext, mime, addr, expiration):
|
||||
self.sha256 = sha256
|
||||
self.ext = ext
|
||||
self.mime = mime
|
||||
self.addr = addr
|
||||
self.expiration = expiration
|
||||
|
||||
def getname(self):
|
||||
return u"{0}{1}".format(su.enbase(self.id), self.ext)
|
||||
|
@ -139,7 +146,16 @@ class File(db.Model):
|
|||
else:
|
||||
return url_for("get", path=n, _external=True) + "\n"
|
||||
|
||||
def store(file_, addr):
|
||||
"""
|
||||
requested_expiration can be:
|
||||
- None, to use the longest allowed file lifespan
|
||||
- a duration (in hours) that the file should live for
|
||||
- a timestamp in epoch millis that the file should expire at
|
||||
|
||||
Any value greater than the longest allowed file lifespan will be rounded down to that
|
||||
value.
|
||||
"""
|
||||
def store(file_, requested_expiration: typing.Optional[int], addr):
|
||||
data = file_.read()
|
||||
digest = sha256(data).hexdigest()
|
||||
|
||||
|
@ -175,15 +191,51 @@ class File(db.Model):
|
|||
|
||||
return ext[:app.config["FHOST_MAX_EXT_LENGTH"]] or ".bin"
|
||||
|
||||
f = File.query.filter_by(sha256=digest).first()
|
||||
def get_expiration() -> int:
    """
    Return the epoch millisecond at which this file should expire.

    Uses the expiration time provided by the user (requested_expiration),
    upper-bounded by the maximum lifespan that get_max_lifespan() computes
    from the size of the file.

    That is, all files are assigned a computed expiration, which can be
    voluntarily shortened by the user either by providing a timestamp in
    epoch millis or a duration in hours.
    """
    # Sample the clock once so that the cap and the requested expiration are
    # measured from the same instant, and truncate to an integer so the
    # declared return type holds.
    current_epoch_millis = int(time.time() * 1000)

    # Maximum lifetime of the file in milliseconds
    this_files_max_lifespan = get_max_lifespan(len(data))

    # The latest allowed expiration date for this file, in epoch millis
    this_files_max_expiration = current_epoch_millis + this_files_max_lifespan

    if requested_expiration is None:
        return this_files_max_expiration

    # Values below this cutoff (an April 2022 timestamp in epoch millis) are
    # too small to be a sensible absolute time and are read as hours instead.
    if requested_expiration < 1650460320000:
        # Treat the requested expiration time as a duration in hours
        requested_expiration_ms = requested_expiration * 60 * 60 * 1000
        return min(this_files_max_expiration, current_epoch_millis + requested_expiration_ms)

    # Treat the requested expiration time as a timestamp in epoch millis
    return min(this_files_max_expiration, requested_expiration)
|
||||
|
||||
f = File.query.filter_by(sha256=digest).first()
|
||||
if f:
|
||||
# If the file already exists
|
||||
if f.removed:
|
||||
# The file was removed by moderation, so don't accept it back
|
||||
abort(451)
|
||||
if f.expiration is None:
|
||||
# The file has expired, so give it a new expiration date
|
||||
f.expiration = get_expiration()
|
||||
else:
|
||||
# The file already exists, update the expiration if needed
|
||||
f.expiration = max(f.expiration, get_expiration())
|
||||
else:
|
||||
mime = get_mime()
|
||||
ext = get_ext(mime)
|
||||
f = File(digest, ext, mime, addr)
|
||||
expiration = get_expiration()
|
||||
f = File(digest, ext, mime, addr, expiration)
|
||||
|
||||
f.addr = addr
|
||||
|
||||
|
@ -194,8 +246,6 @@ class File(db.Model):
|
|||
if not p.is_file():
|
||||
with open(p, "wb") as of:
|
||||
of.write(data)
|
||||
else:
|
||||
p.touch()
|
||||
|
||||
if not f.nsfw_score and app.config["NSFW_DETECT"]:
|
||||
f.nsfw_score = nsfw.detect(p)
|
||||
|
@ -260,11 +310,20 @@ def in_upload_bl(addr):
|
|||
|
||||
return False
|
||||
|
||||
def store_file(f, requested_expiration: typing.Optional[int], addr):
    """
    Store an uploaded file on behalf of addr, unless addr is blocked.

    requested_expiration can be:
     - None, to use the longest allowed file lifespan
     - a duration (in hours) that the file should live for
     - a timestamp in epoch millis that the file should expire at

    Any value greater than the longest allowed file lifespan will be rounded
    down to that value.

    Returns the result of geturl() on the stored file, or a
    (message, 451) tuple when in_upload_bl() reports the address as blocked.
    """
    if in_upload_bl(addr):
        return "Your host is blocked from uploading files.\n", 451

    sf = File.store(f, requested_expiration, addr)

    return sf.geturl()
|
||||
|
||||
|
@ -289,7 +348,7 @@ def store_url(url, addr):
|
|||
|
||||
f = urlfile(read=r.raw.read, content_type=r.headers["content-type"], filename="")
|
||||
|
||||
return store_file(f, addr)
|
||||
return store_file(f, None, addr)
|
||||
else:
|
||||
abort(413)
|
||||
else:
|
||||
|
@ -336,7 +395,23 @@ def fhost():
|
|||
sf = None
|
||||
|
||||
if "file" in request.files:
|
||||
return store_file(request.files["file"], request.remote_addr)
|
||||
try:
|
||||
# Store the file with the requested expiration date
|
||||
return store_file(
|
||||
request.files["file"],
|
||||
int(request.form["expires"]),
|
||||
request.remote_addr
|
||||
)
|
||||
except ValueError:
|
||||
# The requested expiration date wasn't properly formed
|
||||
abort(400)
|
||||
except KeyError:
|
||||
# No expiration date was requested, store with the max lifespan
|
||||
return store_file(
|
||||
request.files["file"],
|
||||
None,
|
||||
request.remote_addr
|
||||
)
|
||||
elif "url" in request.form:
|
||||
return store_url(request.form["url"], request.remote_addr)
|
||||
elif "shorten" in request.form:
|
||||
|
@ -364,3 +439,73 @@ def ehandler(e):
|
|||
return render_template(f"{e.code}.html", id=id), e.code
|
||||
except TemplateNotFound:
|
||||
return "Segmentation fault\n", e.code
|
||||
|
||||
@app.cli.command("prune")
def prune():
    """
    Clean up expired files

    Deletes any files from the filesystem which have hit their expiration time.  This
    doesn't remove them from the database, only from the filesystem.  It's recommended
    that server owners run this command regularly, or set it up on a timer.
    """
    # Expirations are stored as epoch milliseconds
    current_time = time.time() * 1000

    # The path to where uploaded files are stored
    storage = Path(app.config["FHOST_STORAGE_PATH"])

    # Query for all files that have passed their expiration times; a NULL
    # expiration marks a file that has already been pruned.
    expired_files = File.query\
        .where(
            and_(
                File.expiration.is_not(None),
                File.expiration < current_time
            )
        )

    files_removed = 0

    # For every expired file...
    for file in expired_files:
        # Log the file we're about to remove
        file_name = file.getname()
        file_hash = file.sha256
        file_path = storage / file_hash
        print(f"Removing expired file {file_name} [{file_hash}]")

        # Remove it from the file system
        try:
            os.remove(file_path)
            files_removed += 1
        except FileNotFoundError:
            pass  # If the file was already gone, we're good
        except OSError as e:
            print(e)
            # The message needs the f-prefix to interpolate the path, and the
            # adjacent literals need explicit separators because Python joins
            # them without inserting any whitespace.
            print(
                "\n------------------------------------\n"
                f"Encountered an error while trying to remove file {file_path}. Double "
                "check to make sure the server is configured correctly, permissions are "
                "okay, and everything is ship shape, then try again.")
            # Keep the bookkeeping for the files that were already removed
            # during this run before bailing out.
            db.session.commit()
            return

        # Finally, mark that the file was removed
        file.expiration = None

    db.session.commit()

    print(f"\nDone!  {files_removed} file(s) removed")
|
||||
|
||||
""" For a file of a given size, determine the largest allowed lifespan of that file
|
||||
|
||||
Based on the current app's configuration: Specifically, the MAX_CONTENT_LENGTH, as well
|
||||
as FHOST_{MIN,MAX}_EXPIRATION.
|
||||
|
||||
This lifespan may be shortened by a user's request, but no files should be allowed to
|
||||
expire at a point after this number.
|
||||
|
||||
Value returned is a duration in milliseconds.
|
||||
"""
|
||||
def get_max_lifespan(filesize: int) -> int:
    """
    Return the longest lifespan, in milliseconds, allowed for a file of
    `filesize` bytes.

    The result is interpolated along a cubic curve between
    FHOST_MAX_EXPIRATION (for a file of zero bytes) and FHOST_MIN_EXPIRATION
    (for a file of MAX_CONTENT_LENGTH bytes), all read from the app's
    configuration. Sizes outside of that range are clamped to it, so the
    result never leaves the configured bounds.
    """
    min_exp = app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)

    # Flask pre-populates MAX_CONTENT_LENGTH with None, so a default passed to
    # .get() is never used when the setting is left unconfigured; fall back
    # explicitly instead of dividing by None.
    max_size = app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024

    # Fraction of the maximum upload size that this file occupies
    fullness = min(max(filesize / max_size, 0.0), 1.0)

    return min_exp + int((-max_exp + min_exp) * (fullness - 1) ** 3)
|
||||
|
|
|
@ -45,6 +45,19 @@ MAX_CONTENT_LENGTH = 256 * 1024 * 1024 # Default: 256MiB
|
|||
MAX_URL_LENGTH = 4096
|
||||
|
||||
|
||||
# The minimum and maximum amount of time we'll retain a file for
|
||||
#
|
||||
# Small files (nearing zero bytes) are stored for the longest possible expiration date,
|
||||
# while larger files (nearing MAX_CONTENT_LENGTH bytes) are stored for the shortest amount
|
||||
# of time.  Values between these two extremes are interpolated with a cubic curve,
|
||||
# like the one shown on the index page.
|
||||
#
|
||||
# All times are in milliseconds. If you want all files to be stored for the same amount
|
||||
# of time, set these to the same value.
|
||||
FHOST_MIN_EXPIRATION = 30 * 24 * 60 * 60 * 1000
|
||||
FHOST_MAX_EXPIRATION = 365 * 24 * 60 * 60 * 1000
|
||||
|
||||
|
||||
# Use the X-SENDFILE header to speed up serving files w/ compatible webservers
|
||||
#
|
||||
# Some webservers can be configured to use the X-Sendfile header to handle sending
|
||||
|
|
81
migrations/versions/939a08e1d6e5_.py
Normal file
81
migrations/versions/939a08e1d6e5_.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
"""add file expirations
|
||||
|
||||
Revision ID: 939a08e1d6e5
|
||||
Revises: 7e246705da6a
|
||||
Create Date: 2022-11-22 12:16:32.517184
|
||||
|
||||
"""
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '939a08e1d6e5'
|
||||
down_revision = '7e246705da6a'
|
||||
|
||||
from alembic import op
|
||||
from flask import current_app
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from pathlib import Path
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.ext.automap import automap_base
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
""" For a file of a given size, determine the largest allowed lifespan of that file
|
||||
|
||||
Based on the current app's configuration: Specifically, the MAX_CONTENT_LENGTH, as well
|
||||
as FHOST_{MIN,MAX}_EXPIRATION.
|
||||
|
||||
This lifespan may be shortened by a user's request, but no files should be allowed to
|
||||
expire at a point after this number.
|
||||
|
||||
Value returned is a duration in milliseconds.
|
||||
"""
|
||||
def get_max_lifespan(filesize: int) -> int:
    """
    Return the longest lifespan, in milliseconds, allowed for a file of
    `filesize` bytes.

    The result is interpolated along a cubic curve between
    FHOST_MAX_EXPIRATION (for a file of zero bytes) and FHOST_MIN_EXPIRATION
    (for a file of MAX_CONTENT_LENGTH bytes), all read from the current app's
    configuration. Sizes outside of that range are clamped to it, so the
    result never leaves the configured bounds.
    """
    min_exp = current_app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = current_app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)

    # Flask pre-populates MAX_CONTENT_LENGTH with None, so a default passed to
    # .get() is never used when the setting is left unconfigured; fall back
    # explicitly instead of dividing by None.
    max_size = current_app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024

    # Fraction of the maximum upload size that this file occupies
    fullness = min(max(filesize / max_size, 0.0), 1.0)

    return min_exp + int((-max_exp + min_exp) * (fullness - 1) ** 3)
|
||||
|
||||
Base = automap_base()
|
||||
|
||||
def upgrade():
    """
    Add the `expiration` column to the `file` table and backfill it.

    Every non-removed file that is still present in the storage directory
    gets an expiration of its on-disk modification time plus the maximum
    lifespan allowed for its size. Rows that are not updated here keep a NULL
    expiration.
    """
    op.add_column('file', sa.Column('expiration', sa.BigInteger()))

    # Reflect the table as it exists at this revision rather than importing
    # the application's model.
    bind = op.get_bind()
    Base.prepare(autoload_with=bind)
    File = Base.classes.file
    session = Session(bind=bind)

    storage = Path(current_app.config["FHOST_STORAGE_PATH"])

    # List of file hashes which have not expired yet
    # This could get really big for some servers
    try:
        unexpired_files = os.listdir(storage)
    except FileNotFoundError:
        return  # There are no currently unexpired files

    # Calculate an expiration date for all existing files
    files = session.scalars(
        sa.select(File)
        .where(
            sa.not_(File.removed),
            File.sha256.in_(unexpired_files)
        )
    )

    updates = []  # We coalesce updates to the database here
    for file in files:
        file_path = storage / file.sha256
        stat = os.stat(file_path)
        max_age = get_max_lifespan(stat.st_size)  # How long the file is allowed to live, in ms
        file_birth = stat.st_mtime * 1000  # Modification time stands in for the upload time, in ms
        updates.append({'id': file.id, 'expiration': int(file_birth + max_age)})

    # Apply coalesced updates
    session.bulk_update_mappings(File, updates)
    session.commit()
|
||||
|
||||
def downgrade():
    """Revert this revision by dropping the `expiration` column from `file`."""
    op.drop_column(table_name='file', column_name='expiration')
|
|
@ -11,6 +11,15 @@ Or you can shorten URLs:
|
|||
|
||||
File URLs are valid for at least 30 days and up to a year (see below).
|
||||
Shortened URLs do not expire.
|
||||
|
||||
Files can be set to expire sooner by adding an "expires" parameter (in hours)
|
||||
curl -F'file=@yourfile.png' -F'expires=24' {{ fhost_url }}
|
||||
OR by setting "expires" to a timestamp in epoch milliseconds
|
||||
curl -F'file=@yourfile.png' -F'expires=1681996320000' {{ fhost_url }}
|
||||
|
||||
Expired files won't be removed immediately, but will be removed as part of
|
||||
the next purge.
|
||||
|
||||
{% set max_size = config["MAX_CONTENT_LENGTH"]|filesizeformat(True) %}
|
||||
Maximum file size: {{ max_size }}
|
||||
Not allowed: {{ config["FHOST_MIME_BLACKLIST"]|join(", ") }}
|
||||
|
@ -22,7 +31,7 @@ FILE RETENTION PERIOD
|
|||
retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
|
||||
|
||||
days
|
||||
365 | \\
|
||||
{{'{: 6}'.format(config.get("FHOST_MAX_EXPIRATION", 31536000000)//86400000)}} | \\
|
||||
| \\
|
||||
| \\
|
||||
| \\
|
||||
|
@ -30,7 +39,7 @@ retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
|
|||
| \\
|
||||
| ..
|
||||
| \\
|
||||
197.5 | ----------..-------------------------------------------
|
||||
{{'{: 6.1f}'.format((config.get("FHOST_MIN_EXPIRATION", 2592000000)/2 + config.get("FHOST_MAX_EXPIRATION", 31536000000)/2)/86400000)}} | ----------..-------------------------------------------
|
||||
| ..
|
||||
| \\
|
||||
| ..
|
||||
|
@ -39,7 +48,7 @@ retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
|
|||
| ...
|
||||
| ....
|
||||
| ......
|
||||
30 | ....................
|
||||
{{'{: 6}'.format(config.get("FHOST_MIN_EXPIRATION", 2592000000)//86400000)}} | ....................
|
||||
0{{ ((config["MAX_CONTENT_LENGTH"]/2)|filesizeformat(True)).split(" ")[0].rjust(27) }}{{ max_size.split(" ")[0].rjust(27) }}
|
||||
{{ max_size.split(" ")[1].rjust(54) }}
|
||||
</pre>
|
||||
|
|
Loading…
Reference in a new issue