blob: ba18fd3e5ae86cd0996fb89ebc3a90b6785e70d7 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.digest;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
/**
* Operations for computing Git object identifiers and their generalizations described by the
* <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
*
* <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git.
* Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p>
*
* <p>This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.</p>
*
* @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
* @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
* @since 1.22.0
*/
public class GitIdentifiers {
/**
* The type of a Git tree entry, which maps to a Unix file-mode string.
*
* <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
* cover the four entry types that Git itself produces.</p>
*/
public enum FileMode {
/**
* A sub-directory (Git sub-tree).
*/
DIRECTORY("40000"),
/**
* An executable file.
*/
EXECUTABLE("100755"),
/**
* A regular (non-executable) file.
*/
REGULAR("100644"),
/**
* A symbolic link.
*/
SYMBOLIC_LINK("120000");
/**
* The octal mode as used by Git.
*/
private final String mode;
/**
* Serialized {@code mode}: since this is mutable, it must remain private.
*/
private final byte[] modeBytes;
FileMode(final String mode) {
this.mode = mode;
this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII);
}
/**
* Gets the octal mode as used by Git.
*
* @return The octal mode
*/
public String getMode() {
return mode;
}
}
/**
* Represents a single entry in a Git tree object.
*
* <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
* <ul>
* <li>a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
* <li>the entry name (file or directory name, without a path separator),</li>
* <li>the raw object id of the referenced blob or sub-tree.</li>
* </ul>
*
* <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/}
* sorts after {@code foobar}.</p>
*
* @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
* @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
*/
static class DirectoryEntry implements Comparable<DirectoryEntry> {
/**
* The entry name (file or directory name, no path separator).
*/
private final String name;
/**
* The raw object id of the referenced blob or sub-tree.
*/
private final byte[] rawObjectId;
/**
* The key used for ordering entries within a tree object.
*
* <p>>Git appends {@code '/'} to directory names before comparing.</p>
*/
private final String sortKey;
/**
* The Git object type, which determines the Unix file-mode prefix.
*/
private final FileMode type;
/**
* Creates an entry.
*
* @param name The name of the entry
* @param type The type of the entry
* @param rawObjectId The id of the entry
*/
DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) {
if (Objects.requireNonNull(name).indexOf('/') >= 0) {
throw new IllegalArgumentException("Entry name must not contain '/': " + name);
}
this.name = name;
this.type = Objects.requireNonNull(type);
this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
this.rawObjectId = Objects.requireNonNull(rawObjectId);
}
@Override
public int compareTo(final DirectoryEntry o) {
return sortKey.compareTo(o.sortKey);
}
@Override
public boolean equals(final Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof DirectoryEntry)) {
return false;
}
final DirectoryEntry other = (DirectoryEntry) obj;
return name.equals(other.name);
}
@Override
public int hashCode() {
return name.hashCode();
}
}
/**
* Builds a Git tree identifier for a virtual directory structure, such as the contents of
* an archive.
*/
public static final class TreeIdBuilder {
/**
* A supplier of a blob identifier that may throw {@link IOException}.
*/
@FunctionalInterface
private interface BlobIdSupplier {
byte[] get() throws IOException;
}
private static void checkPathComponent(String name) {
if (".".equals(name) || "..".equals(name)) {
throw new IllegalArgumentException("Path component not allowed: " + name);
}
}
private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
private final Map<String, DirectoryEntry> fileEntries = new HashMap<>();
private final MessageDigest messageDigest;
private TreeIdBuilder(final MessageDigest messageDigest) {
this.messageDigest = Objects.requireNonNull(messageDigest);
}
/**
* Returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent.
*
* @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}).
* @return The {@link TreeIdBuilder} for the subdirectory.
* @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}.
*/
public TreeIdBuilder addDirectory(final String name) {
TreeIdBuilder current = this;
for (final String component : name.split("/", -1)) {
if (component.isEmpty()) {
continue;
}
checkPathComponent(component);
current = current.dirEntries.computeIfAbsent(component, k -> new TreeIdBuilder(messageDigest));
}
return current;
}
/**
* Adds a file entry at the given path within this tree.
*
* <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
*
* <p>The stream is eagerly drained.</p>
*
* <p>If the size of the stream is known in advance, consider using {@link TreeIdBuilder#addFile(FileMode, String, long, InputStream)} instead.</p>
*
* @param mode The file mode (e.g. {@link FileMode#REGULAR}).
* @param name The relative path of the entry in normalized form(may contain {@code '/'}).
* @param data The file content.
* @throws IOException If the stream cannot be read.
* @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}.
*/
public void addFile(final FileMode mode, final String name, final InputStream data) throws IOException {
addFile(mode, name, () -> blobId(messageDigest, readAllBytes(data)));
}
/**
* Adds a file entry at the given path within this tree, streaming content without buffering.
*
* <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
*
* <p>The stream is eagerly drained.</p>
*
* @param mode The file mode (e.g. {@link FileMode#REGULAR}).
* @param name The relative path of the entry in normalized form(may contain {@code '/'}).
* @param dataSize The exact number of bytes in {@code data}.
* @param data The file content.
* @throws IOException If the stream cannot be read.
* @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}.
*/
public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException {
addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
}
private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException {
final int slash = name.indexOf('/');
if (slash < 0) {
checkPathComponent(name);
fileEntries.put(name, new DirectoryEntry(name, mode, blobId.get()));
} else {
addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId);
}
}
/**
* Adds a file entry at the given path within this tree.
*
* <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
*
* @param mode The file mode (e.g. {@link FileMode#REGULAR}).
* @param name The relative path of the entry in normalized form(may contain {@code '/'}).
* @param data The file content.
* @throws IOException If an I/O error occurs.
* @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}.
*/
public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException {
addFile(mode, name, () -> blobId(messageDigest, data));
}
/**
* Computes the Git tree identifier for this directory and all its descendants.
*
* @return The raw tree identifier bytes.
* @throws IOException If a digest operation fails.
*/
public byte[] build() throws IOException {
final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values());
for (final Map.Entry<String, TreeIdBuilder> e : dirEntries.entrySet()) {
entries.add(new DirectoryEntry(e.getKey(), FileMode.DIRECTORY, e.getValue().build()));
}
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
for (final DirectoryEntry entry : entries) {
baos.write(entry.type.modeBytes);
baos.write(' ');
baos.write(entry.name.getBytes(StandardCharsets.UTF_8));
baos.write('\0');
baos.write(entry.rawObjectId);
}
messageDigest.reset();
DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size()));
return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest();
}
}
/**
* Reads through a byte array and returns a generalized Git blob identifier.
*
* <p>The identifier is computed in the way described by the
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
* algorithm.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @param data Data to digest.
* @return A generalized Git blob identifier.
*/
public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) {
messageDigest.reset();
DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
return DigestUtils.digest(messageDigest, data);
}
/**
* Reads through a stream and returns a generalized Git blob identifier.
*
* <p>The stream is drained and its contents are buffered to determine the size before hashing. To avoid
* buffering, use {@link #blobId(MessageDigest, long, InputStream)} when the size is known in advance.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @param data Stream to digest.
* @return A generalized Git blob identifier.
* @throws IOException On error reading the stream.
*/
public static byte[] blobId(final MessageDigest messageDigest, final InputStream data) throws IOException {
return blobId(messageDigest, readAllBytes(data));
}
/**
* Reads through a stream of known size and returns a generalized Git blob identifier, without buffering.
*
* <p>When the size of the content is known in advance, this overload streams {@code data} directly through
* the digest without buffering the full content in memory.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @param dataSize The exact number of bytes in {@code data}.
* @param data Stream to digest.
* @return A generalized Git blob identifier.
* @throws IOException On error reading the stream.
*/
public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException {
messageDigest.reset();
DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize));
return DigestUtils.updateDigest(messageDigest, data).digest();
}
/**
* Reads through a file and returns a generalized Git blob identifier.
*
* <p>The identifier is computed in the way described by the
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
* algorithm.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @param data Path to the file to digest.
* @return A generalized Git blob identifier.
* @throws IOException On error accessing the file.
*/
public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException {
messageDigest.reset();
if (Files.isSymbolicLink(data)) {
final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8);
DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(linkTarget.length));
return DigestUtils.digest(messageDigest, linkTarget);
}
DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data)));
return DigestUtils.updateDigest(messageDigest, data).digest();
}
private static FileMode getGitDirectoryEntryType(final Path path) {
// Symbolic links first
if (Files.isSymbolicLink(path)) {
return FileMode.SYMBOLIC_LINK;
}
if (Files.isDirectory(path)) {
return FileMode.DIRECTORY;
}
if (Files.isExecutable(path)) {
return FileMode.EXECUTABLE;
}
return FileMode.REGULAR;
}
private static byte[] getGitBlobPrefix(final long dataSize) {
return getGitPrefix("blob", dataSize);
}
private static byte[] getGitPrefix(final String type, final long dataSize) {
return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
}
private static byte[] getGitTreePrefix(final long dataSize) {
return getGitPrefix("tree", dataSize);
}
private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException {
try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) {
for (final Path path : files) {
final String name = Objects.toString(path.getFileName());
final FileMode mode = getGitDirectoryEntryType(path);
switch (mode) {
case DIRECTORY:
populateFromPath(builder.addDirectory(name), path);
break;
case SYMBOLIC_LINK:
final byte[] linkTarget = Files.readSymbolicLink(path).toString().getBytes(StandardCharsets.UTF_8);
builder.addFile(FileMode.SYMBOLIC_LINK, name, linkTarget);
break;
default:
try (InputStream is = Files.newInputStream(path)) {
builder.addFile(mode, name, Files.size(path), is);
}
break;
}
}
}
}
private static byte[] readAllBytes(final InputStream in) throws IOException {
final ByteArrayOutputStream out = new ByteArrayOutputStream();
final byte[] buf = new byte[DigestUtils.BUFFER_SIZE];
int n;
while ((n = in.read(buf)) != -1) {
out.write(buf, 0, n);
}
return out.toByteArray();
}
/**
* Reads through a directory and returns a generalized Git tree identifier.
*
* <p>The identifier is computed in the way described by the
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
* algorithm.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @param data Path to the directory to digest.
* @return A generalized Git tree identifier.
* @throws IOException On error accessing the directory or its contents.
*/
public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException {
final TreeIdBuilder builder = treeIdBuilder(messageDigest);
populateFromPath(builder, data);
return builder.build();
}
/**
* Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory
* structure, such as the contents of an archive.
*
* <p>The identifier is computed in the way described by the
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
* algorithm.</p>
*
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
*
* @param messageDigest The MessageDigest to use (for example SHA-1).
* @return A new {@link TreeIdBuilder}.
*/
public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) {
return new TreeIdBuilder(messageDigest);
}
private GitIdentifiers() {
// utility class
}
}