| /* |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| package parser |
| |
| import ( |
| "fmt" |
| "net/url" |
| "os" |
| "os/exec" |
| "path" |
| "strings" |
| "time" |
| |
| "github.com/apache/incubator-devlake/core/errors" |
| "github.com/apache/incubator-devlake/core/log" |
| "github.com/apache/incubator-devlake/core/plugin" |
| "github.com/apache/incubator-devlake/helpers/pluginhelper/api" |
| giturls "github.com/chainguard-dev/git-urls" |
| ) |
| |
| var _ RepoCloner = (*GitcliCloner)(nil) |
| var ErrNoData = errors.NotModified.New("No data to be collected") |
| |
// CloneRepoConfig is the configuration for the CloneRepo method.
// It is handed to the subtask state manager as the subtask configuration:
// the subtask should run in Full Sync mode whenever the configuration is changed.
type CloneRepoConfig struct {
	// Optional switches mirrored from the task options (nil means "not set").
	UseGoGit, SkipCommitStat, SkipCommitFiles *bool
	// NoShallowClone is mirrored from the task option of the same name.
	NoShallowClone bool
}
| |
| type GitcliCloner struct { |
| ctx plugin.SubTaskContext |
| taskData *GitExtractorTaskData |
| logger log.Logger |
| stateManager *api.SubtaskStateManager |
| since *time.Time |
| remoteUrl string |
| localDir string |
| success bool |
| syncEnvs []string |
| syncArgs []string |
| } |
| |
| func NewGitcliCloner(ctx plugin.SubTaskContext, localDir string) (*GitcliCloner, errors.Error) { |
| taskData := ctx.GetData().(*GitExtractorTaskData) |
| stateManager := errors.Must1(api.NewSubtaskStateManager(&api.SubtaskCommonArgs{ |
| SubTaskContext: ctx, |
| Params: taskData.Options.GitExtractorApiParams, |
| SubtaskConfig: CloneRepoConfig{ |
| UseGoGit: taskData.Options.UseGoGit, |
| SkipCommitStat: taskData.Options.SkipCommitStat, |
| SkipCommitFiles: taskData.Options.SkipCommitFiles, |
| NoShallowClone: taskData.Options.NoShallowClone, |
| }, |
| })) |
| |
| cloner := &GitcliCloner{ |
| ctx: ctx, |
| taskData: taskData, |
| logger: ctx.GetLogger().Nested("gitcli"), |
| stateManager: stateManager, |
| since: stateManager.GetSince(), |
| remoteUrl: taskData.Options.Url, |
| localDir: localDir, |
| success: false, |
| } |
| return cloner, cloner.prepareSync() |
| } |
| |
| func (g *GitcliCloner) prepareSync() errors.Error { |
| taskData := g.taskData |
| if *taskData.Options.SkipCommitStat { |
| g.syncArgs = append(g.syncArgs, "--filter=blob:none") |
| } |
| remoteUrl, e := giturls.Parse(g.remoteUrl) |
| if e != nil { |
| return errors.Convert(e) |
| } |
| // support proxy |
| if remoteUrl.Scheme == "http" || remoteUrl.Scheme == "https" { |
| if taskData.Options.Proxy != "" { |
| g.syncEnvs = append(g.syncEnvs, fmt.Sprintf("HTTPS_PROXY=%s", taskData.Options.Proxy)) |
| } |
| if remoteUrl.Scheme == "https" && g.ctx.GetConfigReader().GetBool("IN_SECURE_SKIP_VERIFY") { |
| g.syncEnvs = append(g.syncEnvs, "GIT_SSL_NO_VERIFY=true") |
| } |
| } else if remoteUrl.Scheme == "ssh" { |
| var sshCmdArgs []string |
| if taskData.Options.Proxy != "" { |
| parsedProxyURL, e := url.Parse(taskData.Options.Proxy) |
| if e != nil { |
| return errors.BadInput.Wrap(e, "failed to parse the proxy URL") |
| } |
| proxyCommand := "corkscrew" |
| sshCmdArgs = append(sshCmdArgs, "-o", fmt.Sprintf(`ProxyCommand="%s %s %s %%h %%p"`, proxyCommand, parsedProxyURL.Hostname(), parsedProxyURL.Port())) |
| } |
| // support private key |
| if taskData.Options.PrivateKey != "" { |
| pkFile, err := os.CreateTemp("", "gitext-pk") |
| if err != nil { |
| g.logger.Error(err, "create temp private key file error") |
| return errors.Default.New("failed to handle the private key") |
| } |
| if _, e := pkFile.WriteString(taskData.Options.PrivateKey + "\n"); e != nil { |
| g.logger.Error(err, "write private key file error") |
| return errors.Default.New("failed to write the private key") |
| } |
| pkFile.Close() |
| if e := os.Chmod(pkFile.Name(), 0600); e != nil { |
| g.logger.Error(err, "chmod private key file error") |
| return errors.Default.New("failed to modify the private key") |
| } |
| |
| if taskData.Options.Passphrase != "" { |
| pp := exec.CommandContext( |
| g.ctx.GetContext(), |
| "ssh-keygen", "-p", |
| "-P", taskData.Options.Passphrase, |
| "-N", "", |
| "-f", pkFile.Name(), |
| ) |
| if ppout, pperr := pp.CombinedOutput(); pperr != nil { |
| g.logger.Error(pperr, "change private key passphrase error") |
| g.logger.Info(string(ppout)) |
| return errors.Default.New("failed to decrypt the private key") |
| } |
| } |
| defer os.Remove(pkFile.Name()) |
| sshCmdArgs = append(sshCmdArgs, fmt.Sprintf("-i %s -o StrictHostKeyChecking=no", pkFile.Name())) |
| } |
| if len(sshCmdArgs) > 0 { |
| g.syncEnvs = append(g.syncEnvs, fmt.Sprintf("GIT_SSH_COMMAND=ssh %s", strings.Join(sshCmdArgs, " "))) |
| } |
| } |
| return nil |
| } |
| |
| func (g *GitcliCloner) IsIncremental() bool { |
| if g != nil && g.stateManager != nil { |
| if g.stateManager.GetSince() != nil { |
| return true |
| } |
| return g.stateManager.IsIncremental() |
| } |
| |
| return false |
| } |
| |
| func (g *GitcliCloner) CloneRepo() errors.Error { |
| if g.since == nil { |
| // full sync |
| if err := g.fullClone(); err != nil { |
| return err |
| } |
| } else { |
| if g.taskData.Options.NoShallowClone { |
| // data source does not support shallow clone |
| // 1. perform a full clone to accommodate |
| // 2. perform a local shallow clone to reduce the libgit2 memory usage |
| if err := g.doubleClone(); err != nil { |
| return err |
| } |
| } else { |
| // data source support shallow clone |
| if err := g.shallowClone(); err != nil { |
| return err |
| } |
| } |
| if err := g.deepen(); err != nil { |
| return err |
| } |
| g.success = true |
| } |
| return nil |
| } |
| |
| func (g *GitcliCloner) CloseRepo() errors.Error { |
| if g.success { |
| g.logger.Info("save state") |
| return g.stateManager.Close() |
| } |
| return nil |
| } |
| |
| func (g *GitcliCloner) fullClone() errors.Error { |
| return g.gitClone(g.remoteUrl, g.localDir, "--bare") |
| } |
| |
| func (g *GitcliCloner) deepen() errors.Error { |
| // deepen the commits by 1 more step to avoid https://github.com/apache/incubator-devlake/issues/7426 |
| // fixes error described on https://stackoverflow.com/questions/63878612/git-fatal-error-in-object-unshallow-sha-1 |
| // It might be caused by the commit which being deepen has multiple parent(e.g. a merge commit), not sure. |
| if err := g.gitCmd("repack", "-d"); err != nil { |
| return errors.Default.Wrap(err, "failed to repack the repo") |
| } |
| // deepen would fail on a EMPTY repo, ignore the error |
| if err := g.gitFetch("--deepen=1"); err != nil { |
| g.logger.Error(err, "failed to deepen the cloned repo") |
| } |
| return nil |
| } |
| |
| func (g *GitcliCloner) shallowClone() errors.Error { |
| // to fetch newly added commits from ALL branches, we need to the following guide: |
| // https://stackoverflow.com/questions/23708231/git-shallow-clone-clone-depth-misses-remote-branches |
| // 1. clone the repo with depth 1 |
| if err := g.gitClone(g.remoteUrl, g.localDir, "--depth=1", "--bare"); err != nil { |
| return err |
| } |
| // 2. configure to fetch all branches from the remote server, so we can collect new commits from them |
| gitConfig, err := os.OpenFile(path.Join(g.localDir, "config"), os.O_APPEND|os.O_WRONLY, 0644) |
| if err != nil { |
| return errors.Default.Wrap(err, "failed to open git config file") |
| } |
| _, err = gitConfig.WriteString("\tfetch = +refs/heads/*:refs/remotes/origin/*\n") |
| if err != nil { |
| return errors.Default.Wrap(err, "failed to write to git config file") |
| } |
| g.logger.Debug("updated git config to fetch all remote branches") |
| // 3. fetch all branches with depth=1 so the next step would collect fewer commits |
| // (I don't know why, but it reduced total number of commits from 18k to 7k on https://gitlab.com/gitlab-org/gitlab-foss.git with the same parameters) |
| if err := g.gitFetch("--depth=1", "origin"); err != nil { |
| return errors.Default.Wrap(err, "failed to fetch all branches from the remote server") |
| } |
| // 4. fetch all new commits from all branches since the given time |
| if err := g.gitFetch(fmt.Sprintf("--shallow-since=%s", g.since.Format(time.RFC3339))); err != nil { |
| g.logger.Warn(err, "shallow fetch failed") |
| } |
| return nil |
| } |
| |
| func (g *GitcliCloner) doubleClone() errors.Error { |
| intermediaryDir, e := os.MkdirTemp("", "gitextint") |
| if e != nil { |
| return errors.Convert(e) |
| } |
| // step 1: full clone into a intermediary dir |
| backup := g.localDir |
| g.localDir = intermediaryDir |
| if err := g.fullClone(); err != nil { |
| return err |
| } |
| g.localDir = backup |
| // step 2: perform shallow clone against the intermediary dir |
| backup = g.remoteUrl |
| g.remoteUrl = fmt.Sprintf("file://%s", intermediaryDir) // the file:// prefix is required for shallow clone to work |
| if err := g.shallowClone(); err != nil { |
| return err |
| } |
| g.remoteUrl = backup |
| return nil |
| } |
| |
| func (g *GitcliCloner) gitClone(args ...string) errors.Error { |
| args = append(args, g.syncArgs...) |
| return g.git(g.syncEnvs, "", "clone", args...) |
| } |
| |
| func (g *GitcliCloner) gitFetch(args ...string) errors.Error { |
| empty, err := g.repoIsEmpty(args...) |
| if err != nil { |
| g.logger.Error(err, "repo is empty") |
| return err |
| } |
| if empty { |
| g.logger.Info("repo is empty, doesn't need to fetch") |
| return nil |
| } |
| args = append(args, g.syncArgs...) |
| return g.git(g.syncEnvs, g.localDir, "fetch", args...) |
| } |
| |
| func (g *GitcliCloner) repoIsEmpty(args ...string) (bool, errors.Error) { |
| // try to run command: git log |
| // if repo is empty, it will return an error |
| err := g.git(g.syncEnvs, g.localDir, "log") |
| if err != nil { |
| g.logger.Warn(err, "git log failed") |
| return true, nil |
| } |
| return false, nil |
| } |
| |
| func (g *GitcliCloner) gitCmd(gitcmd string, args ...string) errors.Error { |
| return g.git(nil, g.localDir, gitcmd, args...) |
| } |
| |
| func (g *GitcliCloner) git(env []string, dir string, gitcmd string, args ...string) errors.Error { |
| g.logger.Debug("git %s %v", gitcmd, args) |
| args = append([]string{gitcmd}, args...) |
| cmd := exec.CommandContext(g.ctx.GetContext(), "git", args...) |
| cmd.Env = env |
| cmd.Dir = dir |
| return g.execCommand(cmd) |
| } |
| |
| func (g *GitcliCloner) execCommand(cmd *exec.Cmd) errors.Error { |
| output, err := cmd.CombinedOutput() |
| if err != nil { |
| g.logger.Debug("err: %v, output: %s", err, string(output)) |
| outputString := string(output) |
| if strings.Contains(outputString, "fatal: error processing shallow info: 4") || |
| strings.Contains(outputString, "fatal: the remote end hung up unexpectedly") { |
| return ErrNoData |
| } |
| return errors.Default.New(fmt.Sprintf("git cmd %v in %s failed: %s", sanitizeArgs(cmd.Args), cmd.Dir, generateErrMsg(output, err))) |
| } |
| return nil |
| } |
// generateErrMsg picks the most informative message for a failed command:
// the whitespace-trimmed output if it is non-empty, else err.Error(), else
// the literal "unknown error". err must be non-nil when output is blank.
func generateErrMsg(output []byte, err error) string {
	if trimmed := strings.TrimSpace(string(output)); trimmed != "" {
		return trimmed
	}
	if fromErr := err.Error(); fromErr != "" {
		return fromErr
	}
	return "unknown error"
}
| |
// sanitizeArgs returns a copy of args in which the password of every argument
// that parses as a URL with user info is replaced by asterisks, so the result
// can be safely logged or embedded in error messages. Arguments that are not
// URLs, or carry no (or an empty) password, are returned unchanged. The result
// is nil when args is empty.
func sanitizeArgs(args []string) []string {
	var ret []string
	for _, arg := range args {
		ret = append(ret, maskURLPassword(arg))
	}
	return ret
}

// maskURLPassword masks the password inside the user-info part of arg.
//
// Only the raw password span of the authority is rewritten: this also covers
// percent-encoded passwords (whose decoded form never appears literally in
// arg) and leaves the rest of the URL intact even when it happens to contain
// the same characters as the password. The mask has one asterisk per byte of
// the decoded password.
func maskURLPassword(arg string) string {
	u, err := url.Parse(arg)
	if err != nil || u == nil || u.User == nil {
		return arg
	}
	password, ok := u.User.Password()
	if !ok || password == "" {
		return arg
	}
	mask := strings.Repeat("*", len(password))

	// The authority starts right after the first "//" and ends at the first
	// '/', '?' or '#'; user info is everything before the last '@' in it, and
	// the password follows the first ':' of the user info.
	start := strings.Index(arg, "//")
	if start < 0 {
		// not expected for a URL with user info; fall back to plain replacement
		return strings.Replace(arg, password, mask, -1)
	}
	start += len("//")
	end := len(arg)
	if i := strings.IndexAny(arg[start:], "/?#"); i >= 0 {
		end = start + i
	}
	authority := arg[start:end]
	at := strings.LastIndex(authority, "@")
	colon := strings.Index(authority, ":")
	if at < 0 || colon < 0 || colon > at {
		return strings.Replace(arg, password, mask, -1)
	}
	return arg[:start+colon+1] + mask + arg[start+at:]
}