Skip to content

Commit 2b46825

Browse files
修复从帖子中提取solana地址, 有些时候会提取不正常的问题 (#6)
1 parent 347c9f2 commit 2b46825

3 files changed

Lines changed: 51 additions & 46 deletions

File tree

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "v2ex-api-parser",
3-
"version": "0.0.9",
3+
"version": "0.0.10",
44
"description": "专门用于解析V2EX帖子内容的npm包,支持提取发帖人信息、ID、标题和回复内容",
55
"type": "module",
66
"main": "dist/index.esm.js",

src/index.js

Lines changed: 48 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -494,25 +494,22 @@ class V2exParser {
494494

495495
// 方法1: 从script标签中提取
496496
const scriptContent = $('script').text();
497-
const addressMatch = scriptContent.match(/const address = "([A-Za-z0-9]{32-44})"/);
497+
const addressMatch = scriptContent.match(/const address = "([A-Za-z0-9]{32,44})"/);
498498
if (addressMatch) {
499499
result.solanaAddress = addressMatch[1];
500500
}
501501

502-
// 方法2: 从页面文本中查找Solana地址格式
502+
// 方法2: 使用更稳健的文本提取(带边界与URL过滤)
503503
const pageText = $.text();
504-
const solanaAddressRegex = /[1-9A-HJ-NP-Za-km-z]{32,44}/g;
505-
const addresses = pageText.match(solanaAddressRegex);
506-
if (addresses && addresses.length > 0) {
507-
// 过滤掉可能的其他Base58编码字符串
508-
const validAddress = addresses.find(addr => addr.length >= 32 && addr.length <= 44);
509-
if (validAddress) {
510-
result.solanaAddress = validAddress;
511-
}
504+
const sanitizedText = this.removeUrlTransactionSignatures(pageText);
505+
const extracted = this.extractSolanaAddressesWithBoundary(sanitizedText);
506+
if (extracted && extracted.length > 0) {
507+
result.solanaAddress = result.solanaAddress || extracted[0];
512508
}
513509

514510
// 提取sol域名
515-
result.solanaDomain = this.extractSolanaDomain(pageText);
511+
const domains = this.extractSolanaDomainsFromText(pageText);
512+
result.solanaDomain = domains && domains.length > 0 ? domains[0] : null;
516513

517514
return result;
518515
}
@@ -524,26 +521,27 @@ class V2exParser {
524521
*/
525522
extractSolanaDomain(text) {
526523
if (!text) return null;
524+
const domains = this.extractSolanaDomainsFromText(text);
525+
return domains && domains.length > 0 ? domains[0] : null;
526+
}
527527

528-
// 匹配.sol域名的正则表达式
529-
// 确保域名前后没有字符(空白字符除外)
530-
const domainRegex = /(?<!\S)\.sol(?=\s|$)/g;
531-
const matches = text.match(domainRegex);
532-
533-
if (matches && matches.length > 0) {
534-
// 返回第一个匹配的域名
535-
return matches[0];
536-
}
537-
538-
// 如果没有找到.sol,尝试查找其他可能的域名格式
539-
const generalDomainRegex = /(?<!\S)([a-zA-Z0-9-]+\.sol)(?=\s|$)/g;
540-
const generalMatches = text.match(generalDomainRegex);
541-
542-
if (generalMatches && generalMatches.length > 0) {
543-
return generalMatches[0];
528+
/**
529+
* 从文本中移除交易签名类URL,避免将签名误识别为地址
530+
* 例如: solscan.io/tx/<signature>、explorer.solana.com/tx/<signature>
531+
* @param {string} text
532+
* @returns {string}
533+
*/
534+
removeUrlTransactionSignatures(text) {
535+
if (!text) return '';
536+
const patterns = [
537+
/https?:\/\/[^\s]*\/(tx|transaction|confirmTransaction)\/[^\s]*/gi,
538+
/https?:\/\/[^\s]*\/txs\/[^\s]*/gi
539+
];
540+
let sanitized = text;
541+
for (const p of patterns) {
542+
sanitized = sanitized.replace(p, ' ');
544543
}
545-
546-
return null;
544+
return sanitized;
547545
}
548546

549547
/**
@@ -645,7 +643,7 @@ class V2exParser {
645643

646644
// 匹配.sol域名的正则表达式
647645
// 确保域名前后没有字符(空白字符除外)
648-
const domainRegex = /(?<!\S)([a-zA-Z0-9-]+\.sol)(?=\s|$)/g;
646+
const domainRegex = /(?<!\S)([a-zA-Z0-9_-]+\.sol)(?=\s|$)/g;
649647
const matches = text.match(domainRegex);
650648

651649
if (matches && matches.length > 0) {
@@ -657,9 +655,9 @@ class V2exParser {
657655
// 确保以.sol结尾
658656
if (!domain.endsWith('.sol')) return false;
659657

660-
// 确保域名部分只包含字母、数字和连字符
658+
// 确保域名部分只包含字母、数字、下划线和连字符
661659
const domainPart = domain.replace('.sol', '');
662-
if (!/^[a-zA-Z0-9-]+$/.test(domainPart)) return false;
660+
if (!/^[a-zA-Z0-9_-]+$/.test(domainPart)) return false;
663661

664662
return true;
665663
});
@@ -687,20 +685,21 @@ class V2exParser {
687685

688686
const addresses = [];
689687

690-
// 使用更精确的正则表达式,确保地址前后有边界
691-
// 边界可以是:行首、行尾、空格、标点符号等
692-
const addressRegex = /(?<!\S)([1-9A-HJ-NP-Za-km-z]{32,44})(?=\s|$|[^\w])/g;
688+
// 使用更稳健的正则表达式,允许中文标点/文字作为边界
689+
// 左边界:行首或非Base58字符;右边界:非Base58字符或行尾
690+
const addressRegex = /(?:^|[^1-9A-HJ-NP-Za-km-z])([1-9A-HJ-NP-Za-km-z]{32,44})(?![1-9A-HJ-NP-Za-km-z])/g;
693691

694692
let match;
695693
while ((match = addressRegex.exec(text)) !== null) {
696694
const address = match[1];
697695

698-
// 验证地址的有效性
699-
if (this.isValidSolanaAddress(address)) {
700-
// 进一步检查:确保不是URL的一部分
701-
if (!this.isPartOfUrl(text, match.index, address.length)) {
702-
addresses.push(address);
703-
}
696+
// 计算地址在原文本中的真实起始位置(排除左边界占位字符)
697+
const precedingOffset = match[0].length - match[1].length;
698+
const addressStartIndex = match.index + precedingOffset;
699+
700+
// 验证地址的有效性并确保不是URL的一部分
701+
if (this.isValidSolanaAddress(address) && !this.isPartOfUrl(text, addressStartIndex, address.length)) {
702+
addresses.push(address);
704703
}
705704
}
706705

@@ -723,8 +722,8 @@ class V2exParser {
723722
*/
724723
isPartOfUrl(text, startIndex, addressLength) {
725724
// 检查地址前后是否有URL特征
726-
const beforeText = text.substring(Math.max(0, startIndex - 20), startIndex);
727-
const afterText = text.substring(startIndex + addressLength, Math.min(text.length, startIndex + addressLength + 20));
725+
const beforeText = text.substring(Math.max(0, startIndex - 200), startIndex);
726+
const afterText = text.substring(startIndex + addressLength, Math.min(text.length, startIndex + addressLength + 200));
728727

729728
// URL特征:包含http、https、www、.com、.io等
730729
const urlPatterns = [
@@ -737,6 +736,12 @@ class V2exParser {
737736

738737
const combinedText = beforeText + afterText;
739738

739+
// 直接邻近检查,避免窗口截断导致漏判
740+
const immediateBefore = text.substring(Math.max(0, startIndex - 4), startIndex);
741+
if (immediateBefore === '/tx/' || immediateBefore === 'tx/' || immediateBefore === '/address/' || immediateBefore === 'address/') {
742+
return true;
743+
}
744+
740745
return urlPatterns.some(pattern => pattern.test(combinedText));
741746
}
742747

0 commit comments

Comments
 (0)