Skip to content

批量处理示例

演示如何使用共享字体进行高效的批量和并行处理。

共享字体处理

高效批量处理的关键是加载一次字体并重复使用。

typescript
// 从环境特定入口点导入
import { HtmlLayoutParser, CharLayout } from 'html-layout-parser/web';

// ✅ 正确:加载一次字体,解析多个文档
async function efficientBatchProcessing(documents: string[]): Promise<CharLayout[][]> {
  const parser = new HtmlLayoutParser();
  await parser.init();

  try {
    // 只加载一次字体
    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    parser.loadFont(fontData, 'Arial');
    parser.setDefaultFont(1);

    // 使用相同字体解析所有文档
    const results: CharLayout[][] = [];
    for (const html of documents) {
      const layouts = parser.parse(html, { viewportWidth: 800 });
      results.push(layouts);
    }

    return results;
  } finally {
    parser.destroy();
  }
}

// ❌ 低效:为每个文档加载字体
async function inefficientProcessing(documents: string[]): Promise<CharLayout[][]> {
  const results: CharLayout[][] = [];

  for (const html of documents) {
    const parser = new HtmlLayoutParser();
    await parser.init();

    // 为每个文档加载字体 - 浪费资源!
    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    parser.loadFont(fontData, 'Arial');
    parser.setDefaultFont(1);

    const layouts = parser.parse(html, { viewportWidth: 800 });
    results.push(layouts);

    parser.destroy();
  }

  return results;
}

顺序批量处理

typescript
import { HtmlLayoutParser, CharLayout, MemoryMetrics } from 'html-layout-parser/web';

interface BatchResult {
  index: number;
  characterCount: number;
  processingTime: number;
  success: boolean;
  error?: string;
}

class SequentialBatchProcessor {
  private parser: HtmlLayoutParser;
  private initialized = false;
  private loadedFonts: Map<string, number> = new Map();

  constructor() {
    this.parser = new HtmlLayoutParser();
  }

  async init(): Promise<void> {
    if (this.initialized) return;
    await this.parser.init();
    this.initialized = true;
  }

  async loadFont(fontData: Uint8Array, fontName: string): Promise<number> {
    if (this.loadedFonts.has(fontName)) {
      return this.loadedFonts.get(fontName)!;
    }

    const fontId = this.parser.loadFont(fontData, fontName);
    if (fontId > 0) {
      this.loadedFonts.set(fontName, fontId);
      if (this.loadedFonts.size === 1) {
        this.parser.setDefaultFont(fontId);
      }
    }

    return fontId;
  }

  async processBatch(
    documents: Array<{ html: string; css?: string }>,
    options: { viewportWidth: number } = { viewportWidth: 800 },
    onProgress?: (index: number, total: number) => void
  ): Promise<BatchResult[]> {
    const results: BatchResult[] = [];

    for (let i = 0; i < documents.length; i++) {
      const doc = documents[i];
      const startTime = performance.now();

      try {
        const layouts = this.parser.parse(doc.html, {
          viewportWidth: options.viewportWidth,
          css: doc.css
        });

        results.push({
          index: i,
          characterCount: layouts.length,
          processingTime: performance.now() - startTime,
          success: true
        });
      } catch (error) {
        results.push({
          index: i,
          characterCount: 0,
          processingTime: performance.now() - startTime,
          success: false,
          error: error instanceof Error ? error.message : String(error)
        });
      }

      onProgress?.(i + 1, documents.length);
    }

    return results;
  }

  getMemoryMetrics(): MemoryMetrics | null {
    return this.parser.getMemoryMetrics();
  }

  destroy(): void {
    this.parser.destroy();
    this.loadedFonts.clear();
    this.initialized = false;
  }
}

// 使用示例
async function sequentialBatchExample() {
  const processor = new SequentialBatchProcessor();

  try {
    await processor.init();

    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    await processor.loadFont(fontData, 'Arial');

    const documents = Array.from({ length: 50 }, (_, i) => ({
      html: `<div class="item">项目 ${i + 1}</div>`,
      css: '.item { font-size: 16px; color: #333333FF; }'
    }));

    const results = await processor.processBatch(
      documents,
      { viewportWidth: 600 },
      (current, total) => {
        console.log(`进度: ${current}/${total}`);
      }
    );

    const successful = results.filter(r => r.success);
    const totalChars = successful.reduce((sum, r) => sum + r.characterCount, 0);
    console.log(`已处理: ${successful.length}/${results.length}, 总字符数: ${totalChars}`);

  } finally {
    processor.destroy();
  }
}

并行文档解析

typescript
import { HtmlLayoutParser, CharLayout } from 'html-layout-parser/web';

// 分块并行处理
async function chunkedParallelProcessing(
  documents: string[],
  chunkSize: number = 10
): Promise<CharLayout[][]> {
  const parser = new HtmlLayoutParser();
  await parser.init();

  try {
    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    parser.loadFont(fontData, 'Arial');
    parser.setDefaultFont(1);

    const results: CharLayout[][] = [];

    for (let i = 0; i < documents.length; i += chunkSize) {
      const chunk = documents.slice(i, i + chunkSize);
      
      const chunkResults = await Promise.all(
        chunk.map(html => 
          Promise.resolve(parser.parse(html, { viewportWidth: 800 }))
        )
      );

      results.push(...chunkResults);
      console.log(`已处理 ${Math.min(i + chunkSize, documents.length)}/${documents.length}`);
    }

    return results;
  } finally {
    parser.destroy();
  }
}

// 每个文档使用不同 CSS 的并行处理
interface DocumentWithCSS {
  html: string;
  css?: string;
}

async function parallelWithCSS(documents: DocumentWithCSS[]): Promise<CharLayout[][]> {
  const parser = new HtmlLayoutParser();
  await parser.init();

  try {
    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    parser.loadFont(fontData, 'Arial');
    parser.setDefaultFont(1);

    const results = await Promise.all(
      documents.map(doc => 
        Promise.resolve(parser.parse(doc.html, {
          viewportWidth: 800,
          css: doc.css
        }))
      )
    );

    return results;
  } finally {
    parser.destroy();
  }
}

高吞吐量模式

typescript
import { HtmlLayoutParser, CharLayout } from 'html-layout-parser/web';

class HighThroughputParser {
  private parser: HtmlLayoutParser;
  private initialized = false;
  private stats = {
    documentsProcessed: 0,
    charactersProcessed: 0,
    totalTime: 0
  };

  constructor() {
    this.parser = new HtmlLayoutParser();
  }

  async init(): Promise<void> {
    if (this.initialized) return;
    await this.parser.init();
    this.initialized = true;
  }

  async preloadFonts(fonts: Array<{ data: Uint8Array; name: string }>): Promise<void> {
    for (const font of fonts) {
      const fontId = this.parser.loadFont(font.data, font.name);
      if (fontId > 0 && fonts.indexOf(font) === 0) {
        this.parser.setDefaultFont(fontId);
      }
    }
  }

  parse(html: string, viewportWidth: number = 800, css?: string): CharLayout[] {
    const startTime = performance.now();
    
    const result = this.parser.parse(html, {
      viewportWidth,
      css,
      mode: 'flat'
    });

    this.stats.documentsProcessed++;
    this.stats.charactersProcessed += result.length;
    this.stats.totalTime += performance.now() - startTime;

    return result;
  }

  async processBatchParallel(
    documents: Array<{ html: string; css?: string }>,
    viewportWidth: number = 800,
    concurrency: number = 10
  ): Promise<CharLayout[][]> {
    const results: CharLayout[][] = [];

    for (let i = 0; i < documents.length; i += concurrency) {
      const batch = documents.slice(i, i + concurrency);
      
      const batchResults = await Promise.all(
        batch.map(doc => 
          Promise.resolve(this.parse(doc.html, viewportWidth, doc.css))
        )
      );

      results.push(...batchResults);
    }

    return results;
  }

  getStats() {
    return {
      ...this.stats,
      averageTimePerDocument: this.stats.documentsProcessed > 0 
        ? this.stats.totalTime / this.stats.documentsProcessed 
        : 0,
      throughput: this.stats.totalTime > 0
        ? (this.stats.charactersProcessed / (this.stats.totalTime / 1000))
        : 0
    };
  }

  resetStats(): void {
    this.stats = { documentsProcessed: 0, charactersProcessed: 0, totalTime: 0 };
  }

  destroy(): void {
    this.parser.destroy();
    this.initialized = false;
  }
}

// 使用示例
async function highThroughputExample() {
  const parser = new HighThroughputParser();

  try {
    await parser.init();

    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    await parser.preloadFonts([{ data: fontData, name: 'Arial' }]);

    const documents = Array.from({ length: 1000 }, (_, i) => ({
      html: `<div>文档 ${i + 1} 内容。</div>`
    }));

    console.log(`处理 ${documents.length} 个文档...`);
    const startTime = performance.now();

    await parser.processBatchParallel(documents, 800, 20);

    const stats = parser.getStats();
    console.log(`\n文档数: ${stats.documentsProcessed}`);
    console.log(`字符数: ${stats.charactersProcessed}`);
    console.log(`吞吐量: ${stats.throughput.toFixed(0)} 字符/秒`);

  } finally {
    parser.destroy();
  }
}

内存高效的批量处理

typescript
import { HtmlLayoutParser, CharLayout, MemoryMetrics } from 'html-layout-parser/web';

class MemoryEfficientProcessor {
  private parser: HtmlLayoutParser;
  private maxMemoryMB: number;

  constructor(maxMemoryMB: number = 40) {
    this.parser = new HtmlLayoutParser();
    this.maxMemoryMB = maxMemoryMB;
  }

  async init(): Promise<void> {
    await this.parser.init();
  }

  async loadFont(fontData: Uint8Array, fontName: string): Promise<number> {
    const fontId = this.parser.loadFont(fontData, fontName);
    if (fontId > 0) this.parser.setDefaultFont(fontId);
    return fontId;
  }

  private checkMemory(): { usage: number; exceedsLimit: boolean } {
    const metrics = this.parser.getMemoryMetrics();
    const usageMB = metrics ? metrics.totalMemoryUsage / 1024 / 1024 : 0;
    return { usage: usageMB, exceedsLimit: usageMB > this.maxMemoryMB };
  }

  // 流式处理 - 立即处理和处理结果
  async streamProcess(
    documents: Array<{ html: string; css?: string }>,
    viewportWidth: number,
    onResult: (index: number, result: CharLayout[]) => void | Promise<void>
  ): Promise<{ processed: number; totalChars: number }> {
    let processed = 0;
    let totalChars = 0;

    for (let i = 0; i < documents.length; i++) {
      const doc = documents[i];

      const layouts = this.parser.parse(doc.html, {
        viewportWidth,
        css: doc.css
      });

      await onResult(i, layouts);

      processed++;
      totalChars += layouts.length;

      if ((i + 1) % 100 === 0) {
        const mem = this.checkMemory();
        console.log(`已处理 ${i + 1}/${documents.length} (内存: ${mem.usage.toFixed(2)} MB)`);
      }
    }

    return { processed, totalChars };
  }

  destroy(): void {
    this.parser.destroy();
  }
}

// 使用示例
async function memoryEfficientExample() {
  const processor = new MemoryEfficientProcessor(40);

  try {
    await processor.init();

    const fontResponse = await fetch('/fonts/arial.ttf');
    const fontData = new Uint8Array(await fontResponse.arrayBuffer());
    await processor.loadFont(fontData, 'Arial');

    const documents = Array.from({ length: 500 }, (_, i) => ({
      html: `<div>文档 ${i + 1}</div>`
    }));

    const stats = await processor.streamProcess(
      documents,
      800,
      async (index, layouts) => {
        // 在这里保存到文件或数据库
        if (index % 100 === 0) {
          console.log(`已保存文档 ${index} (${layouts.length} 字符)`);
        }
      }
    );

    console.log(`已处理: ${stats.processed}, 总字符数: ${stats.totalChars}`);

  } finally {
    processor.destroy();
  }
}

Released under the MIT License.