Deduplicate files in local cache with or without exec rights
If we introduce an object that is identical to an existing file except
that it carries execution rights, we should not expect the existing
file to suddenly gain execution rights. That breaks reproducibility,
and it is easy to encounter: for example, installing an empty file with
execution rights, or copying a file from another artifact and running
`chmod +x` on it.
diff --git a/src/buildstream/_cas/cascache.py b/src/buildstream/_cas/cascache.py
index 49a5e88..1008eef 100644
--- a/src/buildstream/_cas/cascache.py
+++ b/src/buildstream/_cas/cascache.py
@@ -81,9 +81,10 @@
#
class CASCache():
- def __init__(self, path):
+ def __init__(self, path, *, disable_exec=False):
self.casdir = os.path.join(path, 'cas')
self.tmpdir = os.path.join(path, 'tmp')
+ self._disable_exec = disable_exec
os.makedirs(os.path.join(self.casdir, 'refs', 'heads'), exist_ok=True)
os.makedirs(os.path.join(self.casdir, 'objects'), exist_ok=True)
os.makedirs(self.tmpdir, exist_ok=True)
@@ -136,7 +137,7 @@
# Optionally check presence of files
if with_files:
for filenode in directory.files:
- if not os.path.exists(self.objpath(filenode.digest)):
+ if not os.path.exists(self.objpath(filenode.digest, is_exec=filenode.is_executable)):
return False
# Check subdirectories
@@ -169,13 +170,9 @@
# regular file, create hardlink
fullpath = os.path.join(dest, filenode.name)
if can_link:
- utils.safe_link(self.objpath(filenode.digest), fullpath)
+ utils.safe_link(self.objpath(filenode.digest, is_exec=filenode.is_executable), fullpath)
else:
- utils.safe_copy(self.objpath(filenode.digest), fullpath)
-
- if filenode.is_executable:
- os.chmod(fullpath, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR |
- stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
+ utils.safe_copy(self.objpath(filenode.digest, is_exec=filenode.is_executable), fullpath)
for dirnode in directory.directories:
fullpath = os.path.join(dest, dirnode.name)
@@ -332,8 +329,12 @@
# Returns:
# (str): The path of the object
#
- def objpath(self, digest):
- return os.path.join(self.casdir, 'objects', digest.hash[:2], digest.hash[2:])
+ def objpath(self, digest, *, is_exec=False):
+ if is_exec and not self._disable_exec:
+ filename = '{}.exec'.format(digest.hash[2:])
+ else:
+ filename = digest.hash[2:]
+ return os.path.join(self.casdir, 'objects', digest.hash[:2], filename)
# add_object():
#
@@ -350,7 +351,7 @@
#
# Either `path` or `buffer` must be passed, but not both.
#
- def add_object(self, *, digest=None, path=None, buffer=None, link_directly=False):
+ def add_object(self, *, digest=None, path=None, buffer=None, link_directly=False, is_exec=False):
# Exactly one of the two parameters has to be specified
assert (path is None) != (buffer is None)
@@ -369,8 +370,7 @@
for chunk in iter(lambda: tmp.read(_BUFFER_SIZE), b""):
h.update(chunk)
else:
- tmp = stack.enter_context(self._temporary_object())
-
+ tmp = stack.enter_context(self._temporary_object(is_exec=is_exec))
if path:
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(_BUFFER_SIZE), b""):
@@ -386,7 +386,7 @@
digest.size_bytes = os.fstat(tmp.fileno()).st_size
# Place file at final location
- objpath = self.objpath(digest)
+ objpath = self.objpath(digest, is_exec=is_exec)
os.makedirs(os.path.dirname(objpath), exist_ok=True)
os.link(tmp.name, objpath)
@@ -592,19 +592,25 @@
#
def remote_missing_blobs(self, remote, blobs):
missing_blobs = dict()
+ executable = {}
+
# Limit size of FindMissingBlobs request
for required_blobs_group in _grouper(iter(blobs), 512):
request = remote_execution_pb2.FindMissingBlobsRequest(instance_name=remote.spec.instance_name)
- for required_digest in required_blobs_group:
+ for is_exec, required_digest in required_blobs_group:
d = request.blob_digests.add()
d.CopyFrom(required_digest)
+ if required_digest.hash not in executable:
+ executable[required_digest.hash] = set()
+ executable[required_digest.hash].add(is_exec)
response = remote.cas.FindMissingBlobs(request)
for missing_digest in response.missing_blob_digests:
d = remote_execution_pb2.Digest()
d.CopyFrom(missing_digest)
- missing_blobs[d.hash] = d
+ for is_exec in executable[missing_digest.hash]:
+ missing_blobs[(is_exec, d.hash)] = (is_exec, d)
return missing_blobs.values()
@@ -619,10 +625,10 @@
#
def local_missing_blobs(self, digests):
missing_blobs = []
- for digest in digests:
- objpath = self.objpath(digest)
+ for is_exec, digest in digests:
+ objpath = self.objpath(digest, is_exec=is_exec)
if not os.path.exists(objpath):
- missing_blobs.append(digest)
+ missing_blobs.append((is_exec, digest))
return missing_blobs
# required_blobs_for_directory():
@@ -636,7 +642,7 @@
# parse directory, and recursively add blobs
- yield directory_digest
+ yield False, directory_digest
directory = remote_execution_pb2.Directory()
@@ -644,7 +650,7 @@
directory.ParseFromString(f.read())
for filenode in directory.files:
- yield filenode.digest
+ yield filenode.is_executable, filenode.digest
for dirnode in directory.directories:
if dirnode.name not in excluded_subdirs:
@@ -797,9 +803,9 @@
for filenode in directory.files:
if update_mtime:
- os.utime(self.objpath(filenode.digest))
+ os.utime(self.objpath(filenode.digest, is_exec=filenode.is_executable))
if check_exists:
- if not os.path.exists(self.objpath(filenode.digest)):
+ if not os.path.exists(self.objpath(filenode.digest, is_exec=filenode.is_executable)):
raise FileNotFoundError
reachable.add(filenode.digest.hash)
@@ -813,10 +819,12 @@
#
# Create a named temporary file with 0o0644 access rights.
@contextlib.contextmanager
- def _temporary_object(self):
+ def _temporary_object(self, *, is_exec=False):
with utils._tempnamedfile(dir=self.tmpdir) as f:
- os.chmod(f.name,
- stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
+ access = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
+ if is_exec and not self._disable_exec:
+ access |= stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
+ os.chmod(f.name, access)
yield f
# _ensure_blob():
@@ -830,27 +838,27 @@
# Returns:
# (str): The path of the object
#
- def _ensure_blob(self, remote, digest):
- objpath = self.objpath(digest)
+ def _ensure_blob(self, remote, digest, is_exec=False):
+ objpath = self.objpath(digest, is_exec=is_exec)
if os.path.exists(objpath):
# already in local repository
return objpath
- with self._temporary_object() as f:
+ with self._temporary_object(is_exec=is_exec) as f:
remote._fetch_blob(digest, f)
- added_digest = self.add_object(path=f.name, link_directly=True)
+ added_digest = self.add_object(path=f.name, link_directly=True, is_exec=is_exec)
assert added_digest.hash == digest.hash
return objpath
def _batch_download_complete(self, batch, *, missing_blobs=None):
- for digest, data in batch.send(missing_blobs=missing_blobs):
- with self._temporary_object() as f:
+ for digest, data, is_exec in batch.send(missing_blobs=missing_blobs):
+ with self._temporary_object(is_exec=is_exec) as f:
f.write(data)
f.flush()
- added_digest = self.add_object(path=f.name, link_directly=True)
+ added_digest = self.add_object(path=f.name, link_directly=True, is_exec=is_exec)
assert added_digest.hash == digest.hash
# Helper function for _fetch_directory().
@@ -864,8 +872,9 @@
return _CASBatchRead(remote)
# Helper function for _fetch_directory().
- def _fetch_directory_node(self, remote, digest, batch, fetch_queue, fetch_next_queue, *, recursive=False):
- in_local_cache = os.path.exists(self.objpath(digest))
+ def _fetch_directory_node(self, remote, digest, batch, fetch_queue, fetch_next_queue,
+ *, recursive=False, is_exec=False):
+ in_local_cache = os.path.exists(self.objpath(digest, is_exec=is_exec))
if in_local_cache:
# Skip download, already in local cache.
@@ -873,14 +882,14 @@
elif (digest.size_bytes >= remote.max_batch_total_size_bytes or
not remote.batch_read_supported):
# Too large for batch request, download in independent request.
- self._ensure_blob(remote, digest)
+ self._ensure_blob(remote, digest, is_exec=is_exec)
in_local_cache = True
else:
- if not batch.add(digest):
+ if not batch.add(digest, is_exec=is_exec):
# Not enough space left in batch request.
# Complete pending batch first.
batch = self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
- batch.add(digest)
+ batch.add(digest, is_exec=is_exec)
if recursive:
if in_local_cache:
@@ -963,25 +972,31 @@
batch = _CASBatchRead(remote)
- for digest in digests:
+ for d in digests:
+ try:
+ is_exec, digest = d
+ except TypeError:
+ digest = d
+ is_exec = False
+
if (digest.size_bytes >= remote.max_batch_total_size_bytes or
not remote.batch_read_supported):
# Too large for batch request, download in independent request.
try:
- self._ensure_blob(remote, digest)
+ self._ensure_blob(remote, digest, is_exec=is_exec)
except grpc.RpcError as e:
if e.code() == grpc.StatusCode.NOT_FOUND:
missing_blobs.append(digest)
else:
raise CASCacheError("Failed to fetch blob: {}".format(e)) from e
else:
- if not batch.add(digest):
+ if not batch.add(digest, is_exec=is_exec):
# Not enough space left in batch request.
# Complete pending batch first.
self._batch_download_complete(batch, missing_blobs=missing_blobs)
batch = _CASBatchRead(remote)
- batch.add(digest)
+ batch.add(digest, is_exec=is_exec)
# Complete last pending batch
self._batch_download_complete(batch, missing_blobs=missing_blobs)
@@ -999,8 +1014,14 @@
def send_blobs(self, remote, digests, u_uid=uuid.uuid4()):
batch = _CASBatchUpdate(remote)
- for digest in digests:
- with open(self.objpath(digest), 'rb') as f:
+ for d in digests:
+ try:
+ is_exec, digest = d
+ except TypeError:
+ digest = d
+ is_exec = False
+
+ with open(self.objpath(digest, is_exec=is_exec), 'rb') as f:
assert os.fstat(f.fileno()).st_size == digest.size_bytes
if (digest.size_bytes >= remote.max_batch_total_size_bytes or
diff --git a/src/buildstream/_cas/casremote.py b/src/buildstream/_cas/casremote.py
index cd46e9c..b461d4e 100644
--- a/src/buildstream/_cas/casremote.py
+++ b/src/buildstream/_cas/casremote.py
@@ -310,8 +310,9 @@
self._request.instance_name = remote.instance_name
self._size = 0
self._sent = False
+ self._is_exec = {}
- def add(self, digest):
+ def add(self, digest, *, is_exec=False):
assert not self._sent
new_batch_size = self._size + digest.size_bytes
@@ -323,6 +324,9 @@
request_digest.hash = digest.hash
request_digest.size_bytes = digest.size_bytes
self._size = new_batch_size
+ if digest.hash not in self._is_exec:
+ self._is_exec[digest.hash] = set()
+ self._is_exec[digest.hash].add(is_exec)
return True
def send(self, *, missing_blobs=None):
@@ -349,7 +353,8 @@
raise CASRemoteError("Failed to download blob {}: expected {} bytes, received {} bytes".format(
response.digest.hash, response.digest.size_bytes, len(response.data)))
- yield (response.digest, response.data)
+ for is_exec in self._is_exec[response.digest.hash]:
+ yield (response.digest, response.data, is_exec)
# Represents a batch of blobs queued for upload.
diff --git a/src/buildstream/_cas/casserver.py b/src/buildstream/_cas/casserver.py
index 9606c26..582ac2a 100644
--- a/src/buildstream/_cas/casserver.py
+++ b/src/buildstream/_cas/casserver.py
@@ -64,7 +64,7 @@
def create_server(repo, *, enable_push,
max_head_size=int(10e9),
min_head_size=int(2e9)):
- cas = CASCache(os.path.abspath(repo))
+ cas = CASCache(os.path.abspath(repo), disable_exec=True)
artifactdir = os.path.join(os.path.abspath(repo), 'artifacts', 'refs')
sourcedir = os.path.join(os.path.abspath(repo), 'source_protos')
diff --git a/src/buildstream/storage/_casbaseddirectory.py b/src/buildstream/storage/_casbaseddirectory.py
index 2c5d751..b040755 100644
--- a/src/buildstream/storage/_casbaseddirectory.py
+++ b/src/buildstream/storage/_casbaseddirectory.py
@@ -136,8 +136,8 @@
entry = IndexEntry(filename, _FileType.REGULAR_FILE,
modified=modified or filename in self.index)
path = os.path.join(basename, filename)
- entry.digest = self.cas_cache.add_object(path=path, link_directly=can_link)
entry.is_executable = os.access(path, os.X_OK)
+ entry.digest = self.cas_cache.add_object(path=path, link_directly=can_link, is_exec=entry.is_executable)
self.index[filename] = entry
self.__invalidate_digest()
diff --git a/src/buildstream/storage/_filebaseddirectory.py b/src/buildstream/storage/_filebaseddirectory.py
index 8c55819..4f93737 100644
--- a/src/buildstream/storage/_filebaseddirectory.py
+++ b/src/buildstream/storage/_filebaseddirectory.py
@@ -272,11 +272,8 @@
continue
if entry.type == _FileType.REGULAR_FILE:
- src_path = source_directory.cas_cache.objpath(entry.digest)
+ src_path = source_directory.cas_cache.objpath(entry.digest, is_exec=entry.is_executable)
actionfunc(src_path, dest_path, result=result)
- if entry.is_executable:
- os.chmod(dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR |
- stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
else:
assert entry.type == _FileType.SYMLINK
os.symlink(entry.target, dest_path)
diff --git a/tests/frontend/buildcheckout.py b/tests/frontend/buildcheckout.py
index 97bce91..ccd284c 100644
--- a/tests/frontend/buildcheckout.py
+++ b/tests/frontend/buildcheckout.py
@@ -4,6 +4,8 @@
import os
import tarfile
import hashlib
+import shutil
+import stat
import subprocess
import re
@@ -885,3 +887,34 @@
checkout_dir])
res.assert_main_error(ErrorDomain.STREAM, 'uncached-checkout-attempt')
assert re.findall(r'Remote \((\S+)\) does not have artifact (\S+) cached', res.stderr)
+
+
+@pytest.mark.datafiles(DATA_DIR)
+def test_access_rights(datafiles, cli):
+ project = str(datafiles)
+ checkout = os.path.join(cli.directory, 'checkout')
+
+ shutil.copyfile(os.path.join(project, 'files', 'bin-files', 'usr', 'bin', 'hello'),
+ os.path.join(project, 'files', 'bin-files', 'usr', 'bin', 'hello-2'))
+ os.chmod(os.path.join(project, 'files', 'bin-files', 'usr', 'bin', 'hello'),
+ 0o0755)
+ os.chmod(os.path.join(project, 'files', 'bin-files', 'usr', 'bin', 'hello-2'),
+ 0o0644)
+
+ result = cli.run(project=project, args=['build', 'target.bst'])
+ result.assert_success()
+
+ checkout_args = ['artifact', 'checkout', 'target.bst',
+ '--directory', checkout]
+
+ # Now check it out
+ result = cli.run(project=project, args=checkout_args)
+ result.assert_success()
+
+ st = os.lstat(os.path.join(checkout, 'usr', 'bin', 'hello'))
+ assert stat.S_ISREG(st.st_mode)
+ assert stat.S_IMODE(st.st_mode) == 0o0755
+
+ st = os.lstat(os.path.join(checkout, 'usr', 'bin', 'hello-2'))
+ assert stat.S_ISREG(st.st_mode)
+ assert stat.S_IMODE(st.st_mode) == 0o0644
diff --git a/tests/integration/shell.py b/tests/integration/shell.py
index f7de3e4..b55eb7b 100644
--- a/tests/integration/shell.py
+++ b/tests/integration/shell.py
@@ -430,7 +430,7 @@
# Remove the binary from the CAS
cachedir = cli.config['cachedir']
- objpath = os.path.join(cachedir, 'cas', 'objects', digest[:2], digest[2:])
+ objpath = os.path.join(cachedir, 'cas', 'objects', digest[:2], '{}.exec'.format(digest[2:]))
os.unlink(objpath)
# check shell doesn't work
diff --git a/tests/testutils/artifactshare.py b/tests/testutils/artifactshare.py
index a5522c8..965a4a9 100644
--- a/tests/testutils/artifactshare.py
+++ b/tests/testutils/artifactshare.py
@@ -46,7 +46,7 @@
self.artifactdir = os.path.join(self.repodir, 'artifacts', 'refs')
os.makedirs(self.artifactdir)
- self.cas = CASCache(self.repodir)
+ self.cas = CASCache(self.repodir, disable_exec=True)
self.total_space = total_space
self.free_space = free_space