blob: adeceb9e3587f6ca6be3293f3384f9e1b08918a3 [file] [log] [blame]
/*
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package parser
import (
"fmt"
"net/url"
"os"
"os/exec"
"path"
"strings"
"time"
"github.com/apache/incubator-devlake/core/errors"
"github.com/apache/incubator-devlake/core/log"
"github.com/apache/incubator-devlake/core/plugin"
"github.com/apache/incubator-devlake/helpers/pluginhelper/api"
giturls "github.com/chainguard-dev/git-urls"
)
// Compile-time assertion that *GitcliCloner implements the RepoCloner interface.
var _ RepoCloner = (*GitcliCloner)(nil)
// ErrNoData is a NotModified error carrying the message "No data to be collected".
// In this file it is returned by execCommand when a failed git command's output
// contains one of the fatal messages that execCommand checks for.
var ErrNoData = errors.NotModified.New("No data to be collected")
// CloneRepoConfig is the configuration for the CloneRepo method.
// NewGitcliCloner hands it to the subtask state manager as SubtaskConfig;
// the subtask should run in Full Sync mode whenever the configuration is changed.
type CloneRepoConfig struct {
// UseGoGit is copied from the task options; it is not read by the cloner code visible here.
UseGoGit *bool
// SkipCommitStat is copied from the task options; when the option is true,
// prepareSync adds "--filter=blob:none" to the clone/fetch arguments.
SkipCommitStat *bool
// SkipCommitFiles is copied from the task options; it is not read by the cloner code visible here.
SkipCommitFiles *bool
// NoShallowClone is copied from the task options; when the option is true, an
// incremental CloneRepo goes through doubleClone instead of shallowClone.
NoShallowClone bool
}
// GitcliCloner clones a remote repository into a local directory by shelling
// out to the `git` command line. Instances are built by NewGitcliCloner.
type GitcliCloner struct {
// ctx is the subtask context; it supplies the logger, the config reader and
// the context.Context used for every spawned command.
ctx plugin.SubTaskContext
// taskData is the plugin task data obtained from ctx.GetData().
taskData *GitExtractorTaskData
// logger is the subtask logger nested under "gitcli".
logger log.Logger
// stateManager tracks the subtask state; CloseRepo calls its Close method
// when success is true.
stateManager *api.SubtaskStateManager
// since is stateManager.GetSince() captured at construction time; nil makes
// CloneRepo perform a full clone.
since *time.Time
// remoteUrl is the URL to clone from (initially taskData.Options.Url).
remoteUrl string
// localDir is the directory the repository is cloned into.
localDir string
// success is set by CloneRepo after the incremental path completes and is
// checked by CloseRepo before saving state.
success bool
// syncEnvs holds the environment entries passed to `git clone` / `git fetch`.
syncEnvs []string
// syncArgs holds extra arguments appended to `git clone` / `git fetch`.
syncArgs []string
}
// NewGitcliCloner creates a GitcliCloner that clones the repository described
// by the subtask's *GitExtractorTaskData into localDir.
//
// It builds the subtask state manager from the task options (the options that
// make up CloneRepoConfig are registered as the subtask config), captures the
// state manager's "since" time, and finally calls prepareSync to compute the
// git arguments and environment.
//
// If the state manager cannot be created, (nil, err) is returned instead of
// panicking. If prepareSync fails, the cloner is returned together with the
// error, as before.
//
// Note that ctx.GetData() is type-asserted without a check, so the call
// panics when the task data is not a *GitExtractorTaskData.
func NewGitcliCloner(ctx plugin.SubTaskContext, localDir string) (*GitcliCloner, errors.Error) {
	taskData := ctx.GetData().(*GitExtractorTaskData)
	stateManager, err := api.NewSubtaskStateManager(&api.SubtaskCommonArgs{
		SubTaskContext: ctx,
		Params:         taskData.Options.GitExtractorApiParams,
		SubtaskConfig: CloneRepoConfig{
			UseGoGit:        taskData.Options.UseGoGit,
			SkipCommitStat:  taskData.Options.SkipCommitStat,
			SkipCommitFiles: taskData.Options.SkipCommitFiles,
			NoShallowClone:  taskData.Options.NoShallowClone,
		},
	})
	if err != nil {
		// report the failure to the caller rather than panicking
		return nil, errors.Convert(err)
	}
	cloner := &GitcliCloner{
		ctx:          ctx,
		taskData:     taskData,
		logger:       ctx.GetLogger().Nested("gitcli"),
		stateManager: stateManager,
		since:        stateManager.GetSince(),
		remoteUrl:    taskData.Options.Url,
		localDir:     localDir,
		success:      false,
	}
	return cloner, cloner.prepareSync()
}
// prepareSync computes the extra arguments (g.syncArgs) and environment
// entries (g.syncEnvs) that gitClone and gitFetch later pass to git.
//
//   - When Options.SkipCommitStat is set and true, "--filter=blob:none" is
//     added to the arguments.
//   - For http/https remotes, HTTPS_PROXY is set from Options.Proxy, and
//     GIT_SSL_NO_VERIFY=true is set for https when the IN_SECURE_SKIP_VERIFY
//     config flag is on.
//   - For ssh remotes, GIT_SSH_COMMAND is assembled from an optional corkscrew
//     ProxyCommand (derived from Options.Proxy) and an optional private key
//     that is written to a temp file and, if Options.Passphrase is given,
//     decrypted in place with ssh-keygen.
//
// It returns an error when the remote URL or the proxy URL cannot be parsed,
// or when the private key file cannot be created, written, chmod-ed or
// decrypted.
func (g *GitcliCloner) prepareSync() errors.Error {
	taskData := g.taskData
	// SkipCommitStat is a *bool: guard against nil before dereferencing
	if taskData.Options.SkipCommitStat != nil && *taskData.Options.SkipCommitStat {
		g.syncArgs = append(g.syncArgs, "--filter=blob:none")
	}
	remoteUrl, e := giturls.Parse(g.remoteUrl)
	if e != nil {
		return errors.Convert(e)
	}
	switch remoteUrl.Scheme {
	case "http", "https":
		// support proxy
		if taskData.Options.Proxy != "" {
			g.syncEnvs = append(g.syncEnvs, fmt.Sprintf("HTTPS_PROXY=%s", taskData.Options.Proxy))
		}
		if remoteUrl.Scheme == "https" && g.ctx.GetConfigReader().GetBool("IN_SECURE_SKIP_VERIFY") {
			g.syncEnvs = append(g.syncEnvs, "GIT_SSL_NO_VERIFY=true")
		}
	case "ssh":
		var sshCmdArgs []string
		// support proxy: tunnel ssh through the HTTP proxy with corkscrew
		if taskData.Options.Proxy != "" {
			parsedProxyURL, perr := url.Parse(taskData.Options.Proxy)
			if perr != nil {
				return errors.BadInput.Wrap(perr, "failed to parse the proxy URL")
			}
			proxyCommand := "corkscrew"
			sshCmdArgs = append(sshCmdArgs, "-o", fmt.Sprintf(`ProxyCommand="%s %s %s %%h %%p"`, proxyCommand, parsedProxyURL.Hostname(), parsedProxyURL.Port()))
		}
		// support private key
		if taskData.Options.PrivateKey != "" {
			pkFile, err := os.CreateTemp("", "gitext-pk")
			if err != nil {
				g.logger.Error(err, "create temp private key file error")
				return errors.Default.New("failed to handle the private key")
			}
			// Registered right after creation so the key never stays on disk,
			// including on the error paths below.
			// NOTE(review): this removes the key file as soon as prepareSync
			// returns, i.e. before any git command runs with the
			// GIT_SSH_COMMAND that references it. Confirm whether the file
			// should instead live until the cloner is closed.
			defer os.Remove(pkFile.Name())
			if _, e := pkFile.WriteString(taskData.Options.PrivateKey + "\n"); e != nil {
				_ = pkFile.Close() // best effort; the write error is what gets reported
				g.logger.Error(e, "write private key file error")
				return errors.Default.New("failed to write the private key")
			}
			if e := pkFile.Close(); e != nil {
				g.logger.Error(e, "write private key file error")
				return errors.Default.New("failed to write the private key")
			}
			if e := os.Chmod(pkFile.Name(), 0600); e != nil {
				g.logger.Error(e, "chmod private key file error")
				return errors.Default.New("failed to modify the private key")
			}
			if taskData.Options.Passphrase != "" {
				// strip the passphrase in place so ssh can use the key non-interactively
				pp := exec.CommandContext(
					g.ctx.GetContext(),
					"ssh-keygen", "-p",
					"-P", taskData.Options.Passphrase,
					"-N", "",
					"-f", pkFile.Name(),
				)
				if ppout, pperr := pp.CombinedOutput(); pperr != nil {
					g.logger.Error(pperr, "change private key passphrase error")
					g.logger.Info(string(ppout))
					return errors.Default.New("failed to decrypt the private key")
				}
			}
			sshCmdArgs = append(sshCmdArgs, fmt.Sprintf("-i %s -o StrictHostKeyChecking=no", pkFile.Name()))
		}
		if len(sshCmdArgs) > 0 {
			g.syncEnvs = append(g.syncEnvs, fmt.Sprintf("GIT_SSH_COMMAND=ssh %s", strings.Join(sshCmdArgs, " ")))
		}
	}
	return nil
}
// IsIncremental reports whether this run is an incremental sync.
// A nil cloner or a cloner without a state manager is never incremental;
// otherwise the run is incremental when the state manager has a "since" time
// or when the state manager itself reports incremental mode.
func (g *GitcliCloner) IsIncremental() bool {
	if g == nil || g.stateManager == nil {
		return false
	}
	manager := g.stateManager
	return manager.GetSince() != nil || manager.IsIncremental()
}
// CloneRepo clones the remote repository into g.localDir.
//
// Without a "since" time it performs a full bare clone. With one it performs
// an incremental clone: doubleClone when the data source does not support
// shallow clone (Options.NoShallowClone), shallowClone otherwise, followed by
// deepen. The first error encountered is returned.
//
// NOTE(review): g.success is only set on the incremental path, so CloseRepo
// does not save state after a full clone. Confirm this is intended.
func (g *GitcliCloner) CloneRepo() errors.Error {
	// full sync
	if g.since == nil {
		return g.fullClone()
	}

	// data source supports shallow clone unless told otherwise
	incrementalClone := g.shallowClone
	if g.taskData.Options.NoShallowClone {
		// data source does not support shallow clone
		// 1. perform a full clone to accommodate
		// 2. perform a local shallow clone to reduce the libgit2 memory usage
		incrementalClone = g.doubleClone
	}
	if cloneErr := incrementalClone(); cloneErr != nil {
		return cloneErr
	}
	if deepenErr := g.deepen(); deepenErr != nil {
		return deepenErr
	}
	g.success = true
	return nil
}
// CloseRepo saves the subtask state by closing the state manager, but only
// when CloneRepo marked the run as successful; otherwise it does nothing and
// returns nil.
func (g *GitcliCloner) CloseRepo() errors.Error {
	if !g.success {
		return nil
	}
	g.logger.Info("save state")
	return g.stateManager.Close()
}
// fullClone runs a bare `git clone` of g.remoteUrl into g.localDir with the
// complete history.
func (g *GitcliCloner) fullClone() errors.Error {
	source, target := g.remoteUrl, g.localDir
	return g.gitClone(source, target, "--bare")
}
// deepen extends the shallow history of the cloned repo by one more commit,
// to avoid https://github.com/apache/incubator-devlake/issues/7426.
//
// The repo is repacked first; that works around the error described on
// https://stackoverflow.com/questions/63878612/git-fatal-error-in-object-unshallow-sha-1
// which might be caused by a deepened commit having multiple parents (e.g. a
// merge commit), although that is not confirmed.
//
// A repack failure is returned to the caller. A failure of the deepen fetch is
// only logged, because deepening fails on an EMPTY repo.
func (g *GitcliCloner) deepen() errors.Error {
	repackErr := g.gitCmd("repack", "-d")
	if repackErr != nil {
		return errors.Default.Wrap(repackErr, "failed to repack the repo")
	}
	fetchErr := g.gitFetch("--deepen=1")
	if fetchErr != nil {
		// deliberately ignored: see the doc comment above
		g.logger.Error(fetchErr, "failed to deepen the cloned repo")
	}
	return nil
}
// shallowClone performs an incremental clone of g.remoteUrl into g.localDir
// that contains the commits added to ALL branches since g.since.
//
// To fetch newly added commits from all branches it follows this guide:
// https://stackoverflow.com/questions/23708231/git-shallow-clone-clone-depth-misses-remote-branches
//
// It returns an error when the initial clone, the git config update or the
// depth-1 fetch of all branches fails. A failure of the final
// `--shallow-since` fetch is only logged as a warning.
// g.since must be non-nil; CloneRepo only calls this on the incremental path.
func (g *GitcliCloner) shallowClone() errors.Error {
	// 1. clone the repo with depth 1
	if err := g.gitClone(g.remoteUrl, g.localDir, "--depth=1", "--bare"); err != nil {
		return err
	}
	// 2. configure to fetch all branches from the remote server, so we can collect new commits from them
	gitConfig, err := os.OpenFile(path.Join(g.localDir, "config"), os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return errors.Default.Wrap(err, "failed to open git config file")
	}
	_, err = gitConfig.WriteString("\tfetch = +refs/heads/*:refs/remotes/origin/*\n")
	// always release the file handle, and do it before git reads the config again
	closeErr := gitConfig.Close()
	if err != nil {
		return errors.Default.Wrap(err, "failed to write to git config file")
	}
	if closeErr != nil {
		return errors.Default.Wrap(closeErr, "failed to close git config file")
	}
	g.logger.Debug("updated git config to fetch all remote branches")
	// 3. fetch all branches with depth=1 so the next step would collect fewer commits
	// (the reason is unknown, but it reduced the total number of commits from 18k to 7k
	// on https://gitlab.com/gitlab-org/gitlab-foss.git with the same parameters)
	if err := g.gitFetch("--depth=1", "origin"); err != nil {
		return errors.Default.Wrap(err, "failed to fetch all branches from the remote server")
	}
	// 4. fetch all new commits from all branches since the given time
	if err := g.gitFetch(fmt.Sprintf("--shallow-since=%s", g.since.Format(time.RFC3339))); err != nil {
		g.logger.Warn(err, "shallow fetch failed")
	}
	return nil
}
// doubleClone handles data sources that do not support shallow clone:
// it first makes a full clone of the remote into a temporary intermediary
// directory, then runs shallowClone against that local copy so the final
// repo in g.localDir stays small.
//
// g.localDir and g.remoteUrl are temporarily redirected while the two steps
// run and are always restored before returning, including on failure.
// When either step fails, the intermediary directory is removed and the error
// is returned.
//
// NOTE(review): on success the intermediary directory is left on disk. The
// shallow repo's origin points at it and deepen fetches from it afterwards;
// nothing visible in this file removes it later.
func (g *GitcliCloner) doubleClone() errors.Error {
	intermediaryDir, e := os.MkdirTemp("", "gitextint")
	if e != nil {
		return errors.Convert(e)
	}
	finalDir, realUrl := g.localDir, g.remoteUrl
	defer func() {
		// never leave the cloner pointing at the temporary locations
		g.localDir, g.remoteUrl = finalDir, realUrl
	}()

	// step 1: full clone into a intermediary dir
	g.localDir = intermediaryDir
	if err := g.fullClone(); err != nil {
		_ = os.RemoveAll(intermediaryDir) // best-effort cleanup of the partial clone
		return err
	}
	g.localDir = finalDir

	// step 2: perform shallow clone against the intermediary dir
	g.remoteUrl = fmt.Sprintf("file://%s", intermediaryDir) // the file:// prefix is required for shallow clone to work
	if err := g.shallowClone(); err != nil {
		_ = os.RemoveAll(intermediaryDir) // best-effort cleanup, the clone is unusable now
		return err
	}
	return nil
}
// gitClone runs `git clone` with the given arguments followed by g.syncArgs,
// using g.syncEnvs as the environment and no working directory.
func (g *GitcliCloner) gitClone(args ...string) errors.Error {
	return g.git(g.syncEnvs, "", "clone", append(args, g.syncArgs...)...)
}
// gitFetch runs `git fetch` inside g.localDir with the given arguments
// followed by g.syncArgs, using g.syncEnvs as the environment.
// The fetch is skipped (nil is returned) when repoIsEmpty reports an empty
// repo; an error from repoIsEmpty is logged and returned.
func (g *GitcliCloner) gitFetch(args ...string) errors.Error {
	empty, err := g.repoIsEmpty(args...)
	switch {
	case err != nil:
		g.logger.Error(err, "repo is empty")
		return err
	case empty:
		g.logger.Info("repo is empty, doesn't need to fetch")
		return nil
	}
	return g.git(g.syncEnvs, g.localDir, "fetch", append(args, g.syncArgs...)...)
}
// repoIsEmpty reports whether the repo in g.localDir has no commits.
// It probes by running `git log`, which fails on an empty repo; any failure
// of that command is logged as a warning and treated as "empty".
// The args parameter is not used and the returned error is always nil.
func (g *GitcliCloner) repoIsEmpty(args ...string) (bool, errors.Error) {
	probeErr := g.git(g.syncEnvs, g.localDir, "log")
	if probeErr == nil {
		return false, nil
	}
	g.logger.Warn(probeErr, "git log failed")
	return true, nil
}
// gitCmd runs `git <gitcmd> <args...>` inside g.localDir, passing a nil
// environment slice to git (i.e. without g.syncEnvs).
func (g *GitcliCloner) gitCmd(gitcmd string, args ...string) errors.Error {
	var noExtraEnv []string // nil on purpose
	return g.git(noExtraEnv, g.localDir, gitcmd, args...)
}
// git runs `git <gitcmd> <args...>` and returns the result of execCommand.
//
//   - env is assigned to the command's Env as is. Per os/exec, a nil Env makes
//     the child inherit the current process environment, while a non-nil Env
//     replaces it entirely.
//     NOTE(review): this means that once g.syncEnvs is non-empty, git runs
//     without the parent's variables (PATH, HOME, ...). Confirm this is wanted.
//   - dir is the working directory; empty means the current directory.
//
// The command is bound to the subtask context, so it is killed when that
// context is cancelled. Arguments are passed through sanitizeArgs before
// being logged, because they may contain a remote URL with credentials.
func (g *GitcliCloner) git(env []string, dir string, gitcmd string, args ...string) errors.Error {
	g.logger.Debug("git %s %v", gitcmd, sanitizeArgs(args))
	cmd := exec.CommandContext(g.ctx.GetContext(), "git", append([]string{gitcmd}, args...)...)
	cmd.Env = env
	cmd.Dir = dir
	return g.execCommand(cmd)
}
// execCommand runs cmd and collects its combined stdout/stderr.
// It returns nil on success. On failure it returns ErrNoData when the output
// contains one of the fatal messages checked below, and otherwise a Default
// error that carries the sanitized command line, the working directory and
// the message built by generateErrMsg.
func (g *GitcliCloner) execCommand(cmd *exec.Cmd) errors.Error {
	output, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}
	outputString := string(output)
	g.logger.Debug("err: %v, output: %s", err, outputString)
	noDataMarkers := []string{
		"fatal: error processing shallow info: 4",
		"fatal: the remote end hung up unexpectedly",
	}
	for _, marker := range noDataMarkers {
		if strings.Contains(outputString, marker) {
			return ErrNoData
		}
	}
	detail := generateErrMsg(output, err)
	return errors.Default.New(fmt.Sprintf("git cmd %v in %s failed: %s", sanitizeArgs(cmd.Args), cmd.Dir, detail))
}
// generateErrMsg picks the most informative message for a failed command:
// the whitespace-trimmed command output if it is not empty, otherwise the
// error's own message, otherwise "unknown error".
// A nil err is tolerated and falls through to "unknown error".
func generateErrMsg(output []byte, err error) string {
	if msg := strings.TrimSpace(string(output)); msg != "" {
		return msg
	}
	if err != nil {
		if msg := err.Error(); msg != "" {
			return msg
		}
	}
	return "unknown error"
}
// sanitizeArgs returns a copy of args in which the password of every argument
// that parses as a URL with userinfo is replaced by asterisks, so command
// lines can be logged safely. Arguments that are not such URLs are copied
// unchanged. The result is nil when args is empty.
func sanitizeArgs(args []string) []string {
	var ret []string
	for _, arg := range args {
		ret = append(ret, maskURLPassword(arg))
	}
	return ret
}

// maskURLPassword masks the password inside the userinfo part of arg
// ("scheme://user:password@host/...") with one asterisk per byte of the
// decoded password. Only the userinfo span of the raw string is rewritten, so
// percent-encoded passwords are masked as well and other parts of the URL
// that happen to contain the same text are left alone.
// arg is returned unchanged when it is not a URL carrying a password.
func maskURLPassword(arg string) string {
	u, err := url.Parse(arg)
	if err != nil || u == nil || u.User == nil {
		return arg
	}
	password, ok := u.User.Password()
	if !ok {
		return arg
	}
	// Locate the authority the same way net/url does: it starts after the
	// first "//" and ends at the first '/', '?' or '#'.
	slashes := strings.Index(arg, "//")
	if slashes < 0 {
		return arg
	}
	authStart := slashes + 2
	authEnd := len(arg)
	if i := strings.IndexAny(arg[authStart:], "/?#"); i >= 0 {
		authEnd = authStart + i
	}
	// userinfo is everything before the last '@'; the password follows the first ':'
	at := strings.LastIndex(arg[authStart:authEnd], "@")
	if at < 0 {
		return arg
	}
	colon := strings.Index(arg[authStart:authStart+at], ":")
	if colon < 0 {
		return arg
	}
	pwStart, pwEnd := authStart+colon+1, authStart+at
	return arg[:pwStart] + strings.Repeat("*", len(password)) + arg[pwEnd:]
}