Skip to content

对象解析实现

Git Doctor 核心解析器的详细实现说明。

源码位置

src/core/git-parser.ts

核心函数

parseLooseObject

解析松散对象文件。

typescript
/**
 * Reads and decodes one loose object stored at
 * `<gitDir>/objects/<first 2 hex chars>/<remaining hex chars>`.
 *
 * The file is inflated with zlib and must have the layout
 * `"<type> <size>\0<content>"`.
 *
 * @param gitDir - Path of the git directory that contains `objects/`.
 * @param sha - Hex object id; its first two characters select the sub-directory.
 * @returns The decoded object, or `null` when the file does not exist, cannot
 *   be read or inflated, or carries a malformed header (failures other than a
 *   missing file are logged with `console.error`).
 */
export function parseLooseObject(
  gitDir: string,
  sha: string
): GitObject | null {
  // Loose objects are fanned out by the first two hex characters of the id.
  const objectPath = path.join(
    gitDir,
    'objects',
    sha.slice(0, 2),
    sha.slice(2)
  )

  if (!fs.existsSync(objectPath)) {
    return null
  }

  try {
    // Read the deflated file and inflate it in one go.
    const decompressed = zlib.inflateSync(fs.readFileSync(objectPath))

    // The header ends at the first NUL byte. Without one there is no way to
    // tell header from payload, so treat the object as corrupt instead of
    // slicing with a negative index.
    const nullIndex = decompressed.indexOf(0)
    if (nullIndex === -1) {
      throw new Error('object header is not NUL-terminated')
    }

    // Validate "<type> <size>" strictly so callers never see an unknown type
    // or a NaN size.
    const header = decompressed.subarray(0, nullIndex).toString()
    const match = /^(blob|tree|commit|tag) (\d+)$/.exec(header)
    if (!match) {
      throw new Error(`malformed object header: ${JSON.stringify(header)}`)
    }

    return {
      type: match[1] as 'blob' | 'tree' | 'commit' | 'tag',
      size: parseInt(match[2], 10),
      // Everything after the NUL byte is the object payload.
      content: decompressed.subarray(nullIndex + 1)
    }
  } catch (error) {
    console.error(`Failed to parse object ${sha}:`, error)
    return null
  }
}

parseTreeObject

解析 Tree 对象的二进制内容。

typescript
/**
 * Decodes the binary payload of a tree object into its entries.
 *
 * Each entry is laid out as `<mode> <name>\0<20 raw SHA bytes>`. Decoding
 * stops at the first entry that is incomplete (no space, no NUL, or fewer
 * than 20 id bytes); the entries decoded up to that point are returned.
 *
 * @param content - Tree payload (the bytes after the object header).
 * @returns Entries in the order they appear in the payload.
 */
export function parseTreeObject(content: Buffer): TreeEntry[] {
  const SHA_BYTES = 20
  const result: TreeEntry[] = []

  for (let cursor = 0; cursor < content.length; ) {
    // The mode runs up to the first space at or after the cursor.
    const modeEnd = content.indexOf(0x20, cursor)
    if (modeEnd < 0) break

    // The name runs from just after that space up to the NUL byte.
    const nameEnd = content.indexOf(0x00, modeEnd)
    if (nameEnd < 0) break

    // The object id follows the NUL as raw bytes, not hex text.
    const shaStart = nameEnd + 1
    const rawSha = content.subarray(shaStart, shaStart + SHA_BYTES)
    if (rawSha.length < SHA_BYTES) break

    result.push({
      mode: content.toString('utf8', cursor, modeEnd),
      name: content.toString('utf8', modeEnd + 1, nameEnd),
      sha: rawSha.toString('hex')
    })

    cursor = shaStart + SHA_BYTES
  }

  return result
}

parseCommitObject

解析 Commit 对象。

typescript
/**
 * Parses the textual payload of a commit object.
 *
 * Header lines are read until the first empty line; everything after that
 * empty line is the commit message (returned trimmed). Header fields other
 * than `tree`, `parent`, `author` and `committer` are ignored.
 *
 * @param sha - Id of the commit; copied verbatim into the result.
 * @param content - Commit payload (the bytes after the object header).
 * @returns The parsed commit. Fields whose header line is missing or
 *   unparseable keep their initial values: empty strings, an empty parent
 *   list, and `new Date()` (the time of the call) for the two dates.
 */
export function parseCommitObject(
  sha: string,
  content: Buffer
): CommitInfo {
  const lines = content.toString('utf-8').split('\n')

  const info: CommitInfo = {
    sha,
    tree: '',
    parents: [],
    author: '',
    authorEmail: '',
    authorDate: new Date(),
    committer: '',
    committerEmail: '',
    committerDate: new Date(),
    message: ''
  }

  // Walk the header block; `i` ends on the blank separator line (or at EOF).
  let i = 0
  for (; i < lines.length && lines[i] !== ''; i++) {
    const line = lines[i]

    if (line.startsWith('tree ')) {
      info.tree = line.slice(5)
    } else if (line.startsWith('parent ')) {
      // One `parent` line per parent; order is preserved.
      info.parents.push(line.slice(7))
    } else if (line.startsWith('author ')) {
      const who = parseCommitIdentity(line.slice(7))
      if (who) {
        info.author = who.name
        info.authorEmail = who.email
        info.authorDate = who.date
      }
    } else if (line.startsWith('committer ')) {
      const who = parseCommitIdentity(line.slice(10))
      if (who) {
        info.committer = who.name
        info.committerEmail = who.email
        info.committerDate = who.date
      }
    }
  }

  // Skip the blank separator line; the remainder is the message.
  info.message = lines.slice(i + 1).join('\n').trim()

  return info
}

/**
 * Parses the value of an `author` / `committer` header:
 * `Name <email> timestamp timezone`.
 *
 * The name and the email may be empty (e.g. `Name <>`), which the previous
 * `.+` patterns rejected. The timezone is not needed because the timestamp
 * is already seconds since the Unix epoch.
 */
function parseCommitIdentity(
  value: string
): { name: string; email: string; date: Date } | null {
  const match = /^(.*) <(.*)> (\d+)/.exec(value)
  if (!match) {
    return null
  }
  return {
    name: match[1],
    email: match[2],
    date: new Date(parseInt(match[3], 10) * 1000)
  }
}

walkLooseObjects

遍历所有松散对象。

typescript
/**
 * Lazily enumerates the loose objects under `<gitDir>/objects`.
 *
 * Only sub-directories named with two lowercase hex characters are visited
 * (which also excludes `pack` and `info`), and only files whose name is the
 * hex remainder of an object id are yielded. Other files in those
 * directories, such as temporary files left by an interrupted write, are
 * skipped so callers never receive a string that is not an object id.
 *
 * @param gitDir - Path of the git directory that contains `objects/`.
 * @returns A generator of `{ sha, path }` pairs; yields nothing when the
 *   `objects` directory does not exist.
 */
export function* walkLooseObjects(
  gitDir: string
): Generator<{ sha: string; path: string }> {
  // Fan-out directory: first two hex characters of the object id.
  const fanOutName = /^[0-9a-f]{2}$/
  // File name: the rest of the id, 38 chars for SHA-1 or 62 for SHA-256.
  const objectFileName = /^(?:[0-9a-f]{38}|[0-9a-f]{62})$/

  const objectsDir = path.join(gitDir, 'objects')

  if (!fs.existsSync(objectsDir)) {
    return
  }

  for (const entry of fs.readdirSync(objectsDir, { withFileTypes: true })) {
    if (!entry.isDirectory() || !fanOutName.test(entry.name)) {
      continue
    }

    const subDir = path.join(objectsDir, entry.name)

    for (const objFile of fs.readdirSync(subDir)) {
      if (!objectFileName.test(objFile)) {
        continue
      }
      yield { sha: entry.name + objFile, path: path.join(subDir, objFile) }
    }
  }
}

readRef

读取引用文件,支持符号引用递归解析。

typescript
/**
 * Reads a ref file and follows symbolic refs (`ref: <target>`) until a
 * non-symbolic value is found.
 *
 * @param gitDir - Path of the git directory.
 * @param refPath - Ref path relative to `gitDir`, e.g. `HEAD`.
 * @returns The trimmed content of the final ref file (expected to be an
 *   object id), or `null` when a file in the chain does not exist or the
 *   chain of symbolic refs loops back on itself.
 */
export function readRef(
  gitDir: string,
  refPath: string
): string | null {
  // Files already followed; guards against cycles such as `HEAD -> HEAD`,
  // which would otherwise recurse until the stack overflows.
  const visited = new Set<string>()
  let current = refPath

  for (;;) {
    const fullPath = path.join(gitDir, current)

    if (visited.has(fullPath) || !fs.existsSync(fullPath)) {
      return null
    }
    visited.add(fullPath)

    const content = fs.readFileSync(fullPath, 'utf-8').trim()

    if (!content.startsWith('ref: ')) {
      return content // direct ref: the stored value
    }

    // Symbolic ref: continue with the target it names.
    current = content.slice(5)
  }
}

walkTree

递归遍历 Tree,获取所有文件。

typescript
/**
 * Recursively lists every non-directory entry reachable from a tree object.
 *
 * Trees are loaded with `parseLooseObject`, so a tree that is missing or is
 * not a loose `tree` object contributes no entries.
 *
 * @param gitDir - Path of the git directory.
 * @param treeSha - Id of the tree to start from.
 * @param basePath - Prefix prepended (with `/`) to every returned path.
 * @returns Entries in depth-first order, following the entry order of each
 *   tree; `path` uses `/` as separator.
 */
export function walkTree(
  gitDir: string,
  treeSha: string,
  basePath: string = ''
): Array<{ path: string; sha: string; mode: string }> {
  const files: Array<{ path: string; sha: string; mode: string }> = []

  // All levels append to the same array. Spreading each sub-result into
  // `push(...)` re-copied entries once per directory level and could exceed
  // the engine's argument-count limit on very large subtrees.
  const visit = (sha: string, prefix: string): void => {
    const obj = parseLooseObject(gitDir, sha)
    if (!obj || obj.type !== 'tree') {
      return
    }

    for (const entry of parseTreeObject(obj.content)) {
      const fullPath = prefix ? `${prefix}/${entry.name}` : entry.name

      if (entry.mode === '40000' || entry.mode === '040000') {
        // Directory entry: descend into the sub-tree.
        visit(entry.sha, fullPath)
      } else {
        // Anything else is reported as a file.
        files.push({
          path: fullPath,
          sha: entry.sha,
          mode: entry.mode
        })
      }
    }
  }

  visit(treeSha, basePath)

  return files
}

getAllCommits

从所有本地分支和远程分支出发,沿父提交做 BFS 遍历,收集所有可达的提交(仅能读取松散对象;已打包进 pack 文件的提交会被跳过)。

typescript
/**
 * Collects the ids of all commits reachable from the local and remote
 * branches by following parent links breadth-first.
 *
 * Objects are loaded with `parseLooseObject`; an id that cannot be loaded
 * that way (for example because it only exists in a pack file) or that is
 * not a commit is skipped, and its ancestry is not followed.
 *
 * @param gitDir - Path of the git directory.
 * @returns Commit ids in the order they were first visited.
 */
export async function getAllCommits(gitDir: string): Promise<string[]> {
  const commits: string[] = []
  // Every id already examined, commit or not, so that an unreadable or
  // non-commit object is looked up once rather than on every encounter.
  const visited = new Set<string>()
  const queue: string[] = []

  // Seed the traversal with every branch tip.
  for (const branch of getLocalBranches(gitDir)) {
    queue.push(branch.sha)
  }
  for (const branch of getRemoteBranches(gitDir)) {
    queue.push(branch.sha)
  }

  // Breadth-first walk. A moving head index replaces `queue.shift()`, which
  // re-indexes the array on every call.
  for (let head = 0; head < queue.length; head++) {
    const sha = queue[head]

    if (visited.has(sha)) {
      continue
    }
    visited.add(sha)

    const obj = parseLooseObject(gitDir, sha)
    if (!obj || obj.type !== 'commit') {
      continue // not a loose commit (possibly stored in a pack file)
    }

    commits.push(sha)
    for (const parent of parseCommitObject(sha, obj.content).parents) {
      if (!visited.has(parent)) {
        queue.push(parent)
      }
    }
  }

  return commits
}

getFileHistory

追踪文件的修改历史:按作者时间正序比较各提交中该文件的 blob SHA,记录新增(added)、修改(modified)和删除(deleted),返回结果按最新在前排列。

typescript
/**
 * Builds the change history of one file across all commits returned by
 * `getAllCommitInfos`.
 *
 * Commits are ordered by author date (oldest first) and the blob id found at
 * `filePath` in each commit's tree is compared with the one from the
 * previous commit in that order:
 * - present now, absent before: `added`
 * - present with a different blob id: `modified`
 * - absent now, present before: `deleted`
 *
 * @param gitDir - Path of the git directory.
 * @param filePath - Path of the file as produced by `walkTree`
 *   (`/`-separated, relative to the tree root).
 * @returns History entries, newest first.
 */
export async function getFileHistory(
  gitDir: string,
  filePath: string
): Promise<FileHistoryEntry[]> {
  const commits = await getAllCommitInfos(gitDir)

  // Oldest first; sort a copy so the caller's array is left untouched.
  const chronological = [...commits].sort(
    (a, b) => a.authorDate.getTime() - b.authorDate.getTime()
  )

  const history: FileHistoryEntry[] = []
  let previousBlobSha: string | null = null

  for (const commit of chronological) {
    const file = walkTree(gitDir, commit.tree).find(f => f.path === filePath)

    // Decide what, if anything, happened to the file in this commit.
    let changeType: FileHistoryEntry['changeType'] | null = null
    if (file) {
      if (previousBlobSha === null) {
        changeType = 'added'
      } else if (file.sha !== previousBlobSha) {
        changeType = 'modified'
      }
      previousBlobSha = file.sha
    } else if (previousBlobSha !== null) {
      changeType = 'deleted'
      previousBlobSha = null
    }

    if (changeType !== null) {
      history.push({
        commitSha: commit.sha,
        changeType,
        author: commit.author,
        // Required by FileHistoryEntry; the entries previously omitted it.
        authorEmail: commit.authorEmail,
        date: commit.authorDate,
        message: commit.message
      })
    }
  }

  return history.reverse() // newest first
}

类型定义

typescript
// types/index.ts

/** A decoded Git object, as produced by `parseLooseObject`. */
export interface GitObject {
  /** Object kind, taken from the first word of the object header. */
  type: 'blob' | 'tree' | 'commit' | 'tag'
  /** Size field of the object header, parsed as a base-10 integer. */
  size: number
  /** Raw bytes that follow the header's NUL terminator. */
  content: Buffer
}

/** One entry of a tree object, as produced by `parseTreeObject`. */
export interface TreeEntry {
  /** Mode text exactly as stored in the tree; `walkTree` treats `40000` / `040000` as a directory. */
  mode: string
  /** Entry name (a single path component, not a full path). */
  name: string
  /** Object id of the entry, hex-encoded from the 20 raw bytes stored in the tree. */
  sha: string
}

/** Parsed commit metadata, as produced by `parseCommitObject`. */
export interface CommitInfo {
  /** Id of the commit, as supplied by the caller of the parser. */
  sha: string
  /** Value of the `tree` header; empty string when the header is absent. */
  tree: string
  /** Values of the `parent` headers, in the order they appear. */
  parents: string[]
  /** Name part of the `author` header. */
  author: string
  /** Email part of the `author` header (the text between `<` and `>`). */
  authorEmail: string
  /** Author timestamp, converted from seconds since the Unix epoch. */
  authorDate: Date
  /** Name part of the `committer` header. */
  committer: string
  /** Email part of the `committer` header (the text between `<` and `>`). */
  committerEmail: string
  /** Committer timestamp, converted from seconds since the Unix epoch. */
  committerDate: Date
  /** Text after the first blank line of the commit, trimmed. */
  message: string
}

/** One change to a tracked file, as reported by `getFileHistory`. */
export interface FileHistoryEntry {
  /** Id of the commit in which the change was detected. */
  commitSha: string
  /** Kind of change relative to the previous commit in author-date order. */
  changeType: 'added' | 'modified' | 'deleted'
  /** Author name of that commit. */
  author: string
  /** Author email of that commit (presumably mirrors `CommitInfo.authorEmail`). */
  authorEmail: string
  /** Author date of that commit. */
  date: Date
  /** Commit message of that commit. */
  message: string
}

性能考虑

缓存策略

typescript
/** Successfully parsed objects, keyed by object id. Failed lookups are not stored. */
const objectCache = new Map<string, GitObject>()

/**
 * Memoizing wrapper around `parseLooseObject`.
 *
 * @param gitDir - Path of the git directory; only used on a cache miss.
 * @param sha - Object id, which is also the cache key.
 * @returns The cached or freshly parsed object, or `null` when parsing
 *   yields nothing (in which case the next call tries again).
 */
export function parseLooseObjectCached(
  gitDir: string,
  sha: string
): GitObject | null {
  const hit = objectCache.get(sha)
  if (hit !== undefined) {
    return hit
  }

  const parsed = parseLooseObject(gitDir, sha)
  if (!parsed) {
    return parsed
  }

  objectCache.set(sha, parsed)
  return parsed
}

并行处理

typescript
/**
 * Parses a batch of commits.
 *
 * Each id is loaded with `parseLooseObject`; ids that cannot be loaded or
 * that do not refer to a commit are dropped from the result.
 *
 * @param gitDir - Path of the git directory.
 * @param shas - Object ids to parse.
 * @returns The parsed commits, in the same relative order as `shas`.
 */
async function parseCommitsBatch(
  gitDir: string,
  shas: string[]
): Promise<CommitInfo[]> {
  const parsed = await Promise.all(
    shas.map(async sha => {
      const obj = parseLooseObject(gitDir, sha)
      if (!obj || obj.type !== 'commit') {
        return null
      }
      return parseCommitObject(sha, obj.content)
    })
  )

  // Keep only the ids that resolved to a commit.
  return parsed.filter(info => Boolean(info)) as CommitInfo[]
}

基于 MIT 许可发布