aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/file/fd.c
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2017-12-01 16:30:56 -0800
committerAndres Freund <andres@anarazel.de>2017-12-01 16:30:56 -0800
commitdc6c4c9dc2a111519b76b22daaaac86c5608223b (patch)
treeb9ac520d1aa9163ae52755ee2bc1a549df0c6a57 /src/backend/storage/file/fd.c
parent35438e5763c3021e579472e4b0c4a4d6038570b4 (diff)
downloadpostgresql-dc6c4c9dc2a111519b76b22daaaac86c5608223b.tar.gz
postgresql-dc6c4c9dc2a111519b76b22daaaac86c5608223b.zip
Add infrastructure for sharing temporary files between backends.
SharedFileSet allows temporary files to be created by one backend and then exported for read-only access by other backends, with clean-up managed by reference counting associated with a DSM segment. This includes changes to fd.c and buffile.c to support the new kind of temporary file. This will be used by an upcoming patch adding support for parallel hash joins. Author: Thomas Munro Reviewed-By: Peter Geoghegan, Andres Freund, Robert Haas, Rushabh Lathia Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com https://postgr.es/m/CAH2-WznJ_UgLux=_jTgCQ4yFz0iBntudsNKa1we3kN1BAG=88w@mail.gmail.com
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r--src/backend/storage/file/fd.c395
1 files changed, 336 insertions, 59 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index aa2fe2c6c04..2e93e4ad632 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -39,6 +39,14 @@
* for a long time, like relation files. It is the caller's responsibility
* to close them, there is no automatic mechanism in fd.c for that.
*
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends. Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted. See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
* AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
* wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
* They behave like the corresponding native functions, except that the handle
@@ -175,8 +183,9 @@ int max_safe_fds = 32; /* default if not changed */
#define FilePosIsUnknown(pos) ((pos) < 0)
/* these are the assigned bits in fdstate below: */
-#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
-#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
+#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
typedef struct vfd
{
@@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel)
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
-static void RemovePgTempFilesInDir(const char *tmpdirname);
+static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);
@@ -326,6 +335,7 @@ static void walkdir(const char *path,
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int fsync_parent_path(const char *fname, int elevel);
@@ -1295,6 +1305,39 @@ FileAccess(File file)
}
/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+	/* The size is always passed on to pgstat_report_tempfile(). */
+	pgstat_report_tempfile(size);
+
+	/*
+	 * A log line is emitted only when log_temp_files is non-negative and
+	 * size / 1024 reaches that threshold; otherwise we are done.
+	 */
+	if (log_temp_files < 0 || (size / 1024) < log_temp_files)
+		return;
+
+	ereport(LOG,
+			(errmsg("temporary file: path \"%s\", size %lu",
+					path, (unsigned long) size)));
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+	struct vfd *entry;
+
+	/* Let the current resource owner track the file first. */
+	ResourceOwnerRememberFile(CurrentResourceOwner, file);
+
+	entry = &VfdCache[file];
+	entry->resowner = CurrentResourceOwner;
+
+	/*
+	 * As a fallback, also flag the file for closing at end of transaction
+	 * and note that at least one such file exists.
+	 */
+	entry->fdstate |= FD_CLOSE_AT_EOXACT;
+	have_xact_temporary_files = true;
+}
+
+/*
* Called when we get a shared invalidation message on some relation.
*/
#ifdef NOT_USED
@@ -1379,6 +1422,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
}
/*
+ * Create directory 'directory'. If necessary, create 'basedir', which must
+ * be the directory above it. This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles(). Further subdirectories below
+ * that do not need any particular prefix.
+ */
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+	/* Fast path: the parent already exists and we create the directory. */
+	if (mkdir(directory, S_IRWXU) < 0)
+	{
+		/* Somebody else already created it: nothing to do. */
+		if (errno == EEXIST)
+			return;
+
+		/*
+		 * Failed.  Try to create basedir first in case it's missing. Tolerate
+		 * EEXIST to close a race against another process following the same
+		 * algorithm.
+		 */
+		if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("cannot create temporary directory \"%s\": %m",
+							basedir)));
+
+		/*
+		 * Try again.  A concurrent process running this same sequence may
+		 * have created 'directory' between our first attempt and now, so
+		 * EEXIST here means the directory exists and is not an error.
+		 */
+		if (mkdir(directory, S_IRWXU) < 0 && errno != EEXIST)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("cannot create temporary subdirectory \"%s\": %m",
+							directory)));
+	}
+}
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+	struct stat st;
+
+	/*
+	 * A directory that does not exist (stat fails with ENOENT) is silently
+	 * skipped.  In every other case we walk it.
+	 *
+	 * walkdir currently gives the callback no way to carry state, so we
+	 * cannot report overall success or failure to our caller.  Perhaps it
+	 * should.  Because this runs in a cleanup path we would not behave any
+	 * differently anyway: failures are simply logged.
+	 */
+	if (stat(dirname, &st) == 0 || errno != ENOENT)
+		walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
+/*
* Open a temporary file that will disappear when we close it.
*
* This routine takes care of generating an appropriate tempfile name.
@@ -1432,53 +1536,52 @@ OpenTemporaryFile(bool interXact)
DEFAULTTABLESPACE_OID,
true);
- /* Mark it for deletion at close */
- VfdCache[file].fdstate |= FD_TEMPORARY;
+ /* Mark it for deletion at close and temporary file size limit */
+ VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
/* Register it with the current resource owner */
if (!interXact)
- {
- VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
-
- VfdCache[file].resowner = CurrentResourceOwner;
- ResourceOwnerRememberFile(CurrentResourceOwner, file);
-
- /* ensure cleanup happens at eoxact */
- have_xact_temporary_files = true;
- }
+ RegisterTemporaryFile(file);
return file;
}
/*
- * Open a temporary file in a specific tablespace.
- * Subroutine for OpenTemporaryFile, which see for details.
+ * Return the path of the temp directory in a given tablespace.
*/
-static File
-OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+void
+TempTablespacePath(char *path, Oid tablespace)
{
- char tempdirpath[MAXPGPATH];
- char tempfilepath[MAXPGPATH];
- File file;
-
/*
* Identify the tempfile directory for this tablespace.
*
* If someone tries to specify pg_global, use pg_default instead.
*/
- if (tblspcOid == DEFAULTTABLESPACE_OID ||
- tblspcOid == GLOBALTABLESPACE_OID)
- {
- /* The default tablespace is {datadir}/base */
- snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
- PG_TEMP_FILES_DIR);
- }
+ if (tablespace == InvalidOid ||
+ tablespace == DEFAULTTABLESPACE_OID ||
+ tablespace == GLOBALTABLESPACE_OID)
+ snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
else
{
/* All other tablespaces are accessed via symlinks */
- snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
- tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+ snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+ tablespace, TABLESPACE_VERSION_DIRECTORY,
+ PG_TEMP_FILES_DIR);
}
+}
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+ char tempdirpath[MAXPGPATH];
+ char tempfilepath[MAXPGPATH];
+ File file;
+
+ TempTablespacePath(tempdirpath, tblspcOid);
/*
* Generate a tempfile name that should be unique within the current
@@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
return file;
}
+
+/*
+ * Create a new file. The directory containing it must already exist. Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
+ * inside a directory created with PathNameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+	File		newfile;
+
+	/* Reserve resource-owner space before the file is opened. */
+	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+	/*
+	 * O_EXCL is deliberately not used: an orphaned temp file of the same
+	 * name may be lying around, and it is simply truncated and reused.
+	 */
+	newfile = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+	if (newfile <= 0)
+	{
+		/* Quiet failure requested: hand back the failing result as-is. */
+		if (!error_on_failure)
+			return newfile;
+
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create temporary file \"%s\": %m",
+						path)));
+	}
+
+	/* Flag the file for temp_file_limit accounting. */
+	VfdCache[newfile].fdstate |= FD_TEMP_FILE_LIMIT;
+
+	/* Arrange for it to be closed automatically. */
+	RegisterTemporaryFile(newfile);
+
+	return newfile;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend. Files opened this way don't count against the
+ * temp_file_limit of the caller, are read-only and are automatically closed
+ * at the end of the transaction but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path)
+{
+	File		opened;
+
+	/* Reserve resource-owner space before the file is opened. */
+	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+	/* The file is only ever opened for reading here. */
+	opened = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
+
+	if (opened > 0)
+	{
+		/* Success: arrange for it to be closed automatically. */
+		RegisterTemporaryFile(opened);
+		return opened;
+	}
+
+	/* A missing file is not an error; anything else is. */
+	if (errno != ENOENT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open temporary file \"%s\": %m",
+						path)));
+
+	return opened;
+}
+
+/*
+ * Delete a file by pathname. Return true if the file existed, false if
+ * it didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+	struct stat st;
+	int			stat_errno = 0;
+
+	/* Fetch the final size for reporting; remember why stat failed, if so. */
+	if (stat(path, &st) != 0)
+		stat_errno = errno;
+
+	/*
+	 * Non-existence is tolerated here, unlike in FileClose's automatic
+	 * deletion code.  BufFileDeleteShared relies on this: it does not know
+	 * how many segments there are and keeps deleting until it runs out.
+	 */
+	if (stat_errno == ENOENT)
+		return false;
+
+	if (unlink(path) < 0)
+	{
+		/* The file vanishing in the meantime is not worth reporting. */
+		if (errno != ENOENT)
+			ereport(error_on_failure ? ERROR : LOG,
+					(errcode_for_file_access(),
+					 errmsg("cannot unlink temporary file \"%s\": %m",
+							path)));
+		return false;
+	}
+
+	if (stat_errno != 0)
+	{
+		/* Restore the saved errno so that %m describes the stat failure. */
+		errno = stat_errno;
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
+	}
+	else
+		ReportTemporaryFileUsage(path, st.st_size);
+
+	return true;
+}
+
/*
* close a file when done with it
*/
@@ -1543,10 +1770,17 @@ FileClose(File file)
Delete(file);
}
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ /* Subtract its size from current usage (do first in case of error) */
+ temporary_files_size -= vfdP->fileSize;
+ vfdP->fileSize = 0;
+ }
+
/*
* Delete the file if it was temporary, and make a log entry if wanted
*/
- if (vfdP->fdstate & FD_TEMPORARY)
+ if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
{
struct stat filestats;
int stat_errno;
@@ -1558,11 +1792,8 @@ FileClose(File file)
* is arranged to ensure that the worst-case consequence is failing to
* emit log message(s), not failing to attempt the unlink.
*/
- vfdP->fdstate &= ~FD_TEMPORARY;
+ vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
- /* Subtract its size from current usage (do first in case of error) */
- temporary_files_size -= vfdP->fileSize;
- vfdP->fileSize = 0;
/* first try the stat() */
if (stat(vfdP->fileName, &filestats))
@@ -1576,18 +1807,7 @@ FileClose(File file)
/* and last report the stat results */
if (stat_errno == 0)
- {
- pgstat_report_tempfile(filestats.st_size);
-
- if (log_temp_files >= 0)
- {
- if ((filestats.st_size / 1024) >= log_temp_files)
- ereport(LOG,
- (errmsg("temporary file: path \"%s\", size %lu",
- vfdP->fileName,
- (unsigned long) filestats.st_size)));
- }
- }
+ ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
else
{
errno = stat_errno;
@@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
* message if we do that. All current callers would just throw error
* immediately anyway, so this is safe at present.
*/
- if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
+ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
{
off_t newPos;
@@ -1814,7 +2034,7 @@ retry:
* get here in that state if we're not enforcing temporary_files_size,
* so we don't care.
*/
- if (vfdP->fdstate & FD_TEMPORARY)
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
{
off_t newPos = vfdP->seekPos;
@@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info)
if (returnCode == 0 && VfdCache[file].fileSize > offset)
{
/* adjust our state for truncation of a temp file */
- Assert(VfdCache[file].fdstate & FD_TEMPORARY);
+ Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
temporary_files_size -= VfdCache[file].fileSize - offset;
VfdCache[file].fileSize = offset;
}
@@ -2594,6 +2814,24 @@ TempTablespacesAreSet(void)
}
/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files. Return the number that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+	int			ncopied = 0;
+
+	Assert(TempTablespacesAreSet());
+
+	/* Copy until either the source list or the output array is exhausted. */
+	while (ncopied < numTempTableSpaces && ncopied < numSpaces)
+	{
+		tableSpaces[ncopied] = tempTableSpaces[ncopied];
+		++ncopied;
+	}
+
+	return ncopied;
+}
+
+/*
* GetNextTempTableSpace
*
* Select the next temp tablespace to use. A result of InvalidOid means
@@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit)
{
unsigned short fdstate = VfdCache[i].fdstate;
- if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
+ if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+ VfdCache[i].fileName != NULL)
{
/*
* If we're in the process of exiting a backend process, close
@@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit)
*/
if (isProcExit)
FileClose(i);
- else if (fdstate & FD_XACT_TEMPORARY)
+ else if (fdstate & FD_CLOSE_AT_EOXACT)
{
elog(WARNING,
"temporary file %s not closed at end-of-transaction",
@@ -2751,7 +2990,7 @@ RemovePgTempFiles(void)
* First process temp files in pg_default ($PGDATA/base)
*/
snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
- RemovePgTempFilesInDir(temp_path);
+ RemovePgTempFilesInDir(temp_path, false);
RemovePgTempRelationFiles("base");
/*
@@ -2767,7 +3006,7 @@ RemovePgTempFiles(void)
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
- RemovePgTempFilesInDir(temp_path);
+ RemovePgTempFilesInDir(temp_path, false);
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
@@ -2785,9 +3024,15 @@ RemovePgTempFiles(void)
#endif
}
-/* Process one pgsql_tmp directory for RemovePgTempFiles */
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles. At the top level in
+ * each tablespace, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked. When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ */
static void
-RemovePgTempFilesInDir(const char *tmpdirname)
+RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all)
{
DIR *temp_dir;
struct dirent *temp_de;
@@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname)
snprintf(rm_path, sizeof(rm_path), "%s/%s",
tmpdirname, temp_de->d_name);
- if (strncmp(temp_de->d_name,
+ if (unlink_all ||
+ strncmp(temp_de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
- unlink(rm_path); /* note we ignore any error */
+ {
+ struct stat statbuf;
+
+ /* note that we ignore any error here and below */
+ if (lstat(rm_path, &statbuf) < 0)
+ continue;
+
+ if (S_ISDIR(statbuf.st_mode))
+ {
+ RemovePgTempFilesInDir(rm_path, true);
+ rmdir(rm_path);
+ }
+ else
+ unlink(rm_path);
+ }
else
elog(LOG,
"unexpected file found in temporary-files directory: \"%s\"",
@@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel)
fsync_fname_ext(fname, isdir, true, elevel);
}
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+	if (!isdir)
+	{
+		/* Go through PathNameDeleteTemporaryFile so the file size is reported */
+		PathNameDeleteTemporaryFile(fname, false);
+		return;
+	}
+
+	/* Directory: remove it, treating "already gone" as success. */
+	if (rmdir(fname) != 0 && errno != ENOENT)
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not rmdir directory \"%s\": %m", fname)));
+}
+
/*
* fsync_fname_ext -- Try to fsync a file or directory
*