计算 Git 文件 blob SHA(处理表情符号等)

问题描述 投票:0回答:1

我正在使用 javascript API 将文件推送到 GitHub。如果 GitHub 上的文件未更改,我希望避免耗尽 API 调用(达到速率限制)。点击树 API 来获取文件的 sha 似乎并不计入速率限制,因此我认为一个好的策略是 (a) 计算将使用 javascript 上传的文件的 sha 并 (b) 将其与GitHub 上现有文件的 sha。此策略适用于纯文本文件。但是当涉及表情符号时,计算会导致不匹配。

我有一个简单的仓库这里

我创建了一个内容/读写(仅限此存储库)个人访问令牌,您可以使用它直接运行代码(例如从 javascript 控制台窗口)。

控制台输出结果是简单的字母“a”文件,其 sha 为“2e65efe2a145dda7ee51d1741299f848e5bf752e”,如 GitHub 和 JavaScript 报告的那样。在我的测试中,它也适用于包含换行符等的 json 文件。但是一旦涉及表情符号,sha 就不再匹配。

如果您有关于如何在涉及表情符号/特殊字符时本地计算 sha 的想法,我将不胜感激。谢谢!


var _repo = 'sospike';
var _owner = 'vsjc91';

var _lettera = 'a';
var _emoji = '🌻';

// I created a contents/read-write (this repo only) personal-access-token that you can use:
var _auth = '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';

// stackoverflow.com/questions/73419876/javascript-replace-all-emoji-in-a-string-to-unicode-and-back
// also read re "deprecated" but "ok for this use case"
// stackoverflow.com/questions/30631927/converting-to-base64-in-javascript-without-deprecated-escape-call
const encodeTextEmojis = function(plainText) {
  return unescape(encodeURIComponent(plainText));
}

const decodeBase64Emojis = function(b64) {
  return decodeURIComponent(escape(b64));
}

// stackoverflow.com/questions/73419876/javascript-replace-all-emoji-in-a-string-to-unicode-and-back
const toBase64 = function(str, adjustEmojis) {
  if (adjustEmojis == true) {
    str = encodeTextEmojis(str);
  }
  return btoa(str);
}

const fromBase64 = function(b64, adjustEmojis) {
  if (adjustEmojis == true) {
    b64 = decodeBase64Emojis(b64);
  }
  return atob(b64);
}

const buildHeaders = function() {
  return {
        Accept: 'application/vnd.github+json',
        Authorization: `Bearer ${_auth}`
      }
}

const tryGetFileResponse = async function(url, headers) {
  const request = { method: "GET", headers: headers };
  try {
    return await fetch(url, request);
  }
  catch {
    // it's ok if the file isn't there
    return null;
  }
}

const getExistingSha = async function(fileName) {
  const url = `https://api.github.com/repos/${_owner}/${_repo}/git/trees/main:?cacheBreaker=${Date.now()}`;
  const headers = buildHeaders();
  const response = await tryGetFileResponse(url, headers);
  if (!response) { return null; }
  const tree = await response.json();
  if (!tree || !tree.tree) { return null; }
  const files = tree.tree;
  
  const file = files.find(function(f) {
    return f.path && f.path.toLowerCase() == fileName.toLowerCase();
  });
  
  if (!file) { return null; }
  console.log('github reported length for ' + fileName + ' of ' + file.size);
  return file.sha;
}

const upload = async function(path, content) {
  const commitMsg = 'testing';
  const encodedContent = toBase64(content, true);

  const existingSha = await getExistingSha(path, content);
  console.log('existing sha: ' + existingSha);

  const putFileBody = { message: commitMsg, content: encodedContent };
  if (existingSha) {
    putFileBody.sha = existingSha;
  }

  await (await fetch(
    `https://api.github.com/repos/${_owner}/${_repo}/contents/${path}`,
    {
      method: 'PUT',
      headers: buildHeaders(),
      body: JSON.stringify(putFileBody),
    }
  )).json();
}

/****************************************/
// calculate sha from javascript
/****************************************/

const utf8ByteLen = function(str) {
  if (!str || str.length == 0) { return 0; }
  const inputBytes = new TextEncoder().encode(str);
  return inputBytes.length;
}
  
// stackoverflow.com/a/40031979/9014097
const buf2hex = function (buffer) { // buffer is an ArrayBuffer
  return Array.prototype.map.call(new Uint8Array(buffer), x => ('00' + x.toString(16)).slice(-2)).join('');
}
  
// stackoverflow.com/questions/63736585/why-does-crypto-subtle-digest-return-an-empty-object
const calcSha1 = async function(str) {
  if (!str || str.length == 0) { return null; }
  const inputBytes = new TextEncoder().encode(str);
  const hashBytes = await window.crypto.subtle.digest('SHA-1', inputBytes);
  const hashedStr = buf2hex(hashBytes);
  return hashedStr;
}
  
// stackoverflow.com/questions/7225313/how-does-git-compute-file-hashes?rq=3
const calcGithubTextContentSha = async function(text) {
  const adjText = encodeTextEmojis(text);
  const len = utf8ByteLen(text);
  const data = `blob ${len}\0${adjText}`;
  const sha = calcSha1(data);
  return sha;
}

/****************************************/
// run the test
/****************************************/

console.log('A text file created manually with just "a" at github.com has:')
const sha_manual = await getExistingSha('plain_typed.txt');
console.log('that sha is: ' + sha_manual);

console.log('Upload a file containing just "a" to github via code and fetch back its sha:')
await upload('plain.txt', _lettera);
const sha = await getExistingSha('plain.txt');
console.log('pushed "a" file with sha: ' + sha);

console.log('Compare vs. calculate from javascript');
const sha_js = await calcGithubTextContentSha(_lettera);
console.log('js sha for "a" file: ' + sha_js);

console.log('Now try an emoji character');
await upload('emoji.txt', _emoji);
const sha_emoji = await getExistingSha('emoji.txt');
console.log('github emoji sha: ' + sha_emoji);

const sha_emoji_js = await calcGithubTextContentSha(_emoji);
console.log('js emoji sha: ' + sha_emoji_js);


(可能的线索? - 我注意到仅包含字母“a”的文件的文件大小是 1 或 2,具体取决于它是使用 GitHub.com 手动编辑还是使用此代码推送)。

这是日志输出:

A text file created manually with just "a" at github.com has:
github reported length for plain_typed.txt of 2
that sha is: 78981922613b2afb6025042ff6bd878ac1994e85

Upload a file containing just "a" to github via code and fetch back its sha:
github reported length for plain.txt of 1
existing sha: 2e65efe2a145dda7ee51d1741299f848e5bf752e
github reported length for plain.txt of 1
pushed "a" file with sha: 2e65efe2a145dda7ee51d1741299f848e5bf752e

Compare vs. calculate from javascript
js sha for "a" file: 2e65efe2a145dda7ee51d1741299f848e5bf752e

Now try an emoji character
github reported length for emoji.txt of 4
existing sha: 9ad8dd6d25e074eca9e19e06458bb9e7a314a875
github reported length for emoji.txt of 4
github emoji sha: 9ad8dd6d25e074eca9e19e06458bb9e7a314a875
js emoji sha: 553045c77ac300d4907c43bf0de77326ec610474
git byte github-api endianness
1个回答
0
投票

有一些明显的问题:

encodeTextEmojis
decodeBase64Emojis
函数分别使用
unescape
escape
。不建议使用这些函数来处理表情符号等多字节字符。此外,这些方法已被弃用。
由于
toBase64
fromBase64
函数调用上述编码和解码函数,这可能会导致表情符号的错误处理。
calcGithubTextContentSha
函数中,调用
encodeTextEmojis
函数可能会更改文本,从而导致 SHA-1 哈希值与预期不同。

GitHub 报告的手动创建的文件与以编程方式上传的文件的文件大小差异可能是编码差异的迹象,尽管它没有直接反映在代码中。它也可能是在一个环境中添加的 eol(行尾字符),而不是在另一个环境中添加的。

使用 TextEncoder 更安全:

// Use TextEncoder for encoding text to bytes consistently.
const textEncoder = new TextEncoder();

// That function will accurately calculate the byte length of a string, including multibyte characters.
const utf8ByteLen = function(str) {
  if (!str || str.length == 0) { return 0; }
  return textEncoder.encode(str).length;
}

// That function will convert a buffer to hexadecimal representation.
const buf2hex = function (buffer) {
  return Array.prototype.map.call(new Uint8Array(buffer), x => ('00' + x.toString(16)).slice(-2)).join('');
}

// That function will calculate the SHA-1 hash of the input string.
const calcSha1 = async function(str) {
  if (!str || str.length == 0) { return null; }
  const inputBytes = textEncoder.encode(str);  // Encoding the text to bytes using TextEncoder
  const hashBytes = await window.crypto.subtle.digest('SHA-1', inputBytes);
  return buf2hex(hashBytes);
}

// That function will calculate the SHA-1 hash in the same format as GitHub does for text content.
const calcGithubTextContentSha = async function(text) {
  const len = utf8ByteLen(text);  // Getting the accurate byte length
  const data = `blob ${len}\0${text}`;  // Preparing the data string
  return await calcSha1(data);  // Calculating the SHA-1 hash
}

/****************************************/
// run the test
/****************************************/

console.log('A text file created manually with just "a" at github.com has:');
const sha_manual = await getExistingSha('plain_typed.txt');
console.log('that sha is: ' + sha_manual);

console.log('Upload a file containing just "a" to github via code and fetch back its sha:');
await upload('plain.txt', _lettera);
const sha = await getExistingSha('plain.txt');
console.log('pushed "a" file with sha: ' + sha);

console.log('Compare vs. calculate from javascript');
const sha_js = await calcGithubTextContentSha(_lettera);
console.log('js sha for "a" file: ' + sha_js);

console.log('Now try an emoji character');
await upload('emoji.txt', _emoji);
const sha_emoji = await getExistingSha('emoji.txt');
console.log('github emoji sha: ' + sha_emoji);

const sha_emoji_js = await calcGithubTextContentSha(_emoji);
console.log('js emoji sha: ' + sha_emoji_js);

utf8ByteLen
calcSha1
函数利用
textEncoder
来计算字符串的准确字节长度,包括多字节字符。
更新了
calcGithubTextContentSha
函数,删除了
encodeTextEmojis
函数的使用,直接使用文本输入,同时准备用于 SHA-1 哈希计算的数据字符串。

注意:这是文件的 SHA1,不是您在 GitHub 上需要的 SHA1,它代表提交的 SHA1,并且不仅仅包含文件的内容,但是:

  • 提交的源树(分解为所有子树和 blob)
  • 父提交 sha1
  • 作者信息
  • 提交者信息(这些信息可以不同!)
  • 提交消息
© www.soinside.com 2019 - 2024. All rights reserved.