/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#include <stdio.h>
#include <string_view>
#include <sal/main.h>
#include <osl/file.h>
#include <osl/thread.h>
#include <rtl/alloc.h>
#include <rtl/ustring.hxx>
#include <rtl/strbuf.hxx>
#include <o3tl/string_view.hxx>
#include <pdfparse.hxx>
using namespace pdfparse;
/** Print the command line synopsis and the option descriptions to stdout.

    @param pExe
    program name to show in the usage lines (main passes argv[0])
*/
static void printHelp( const char* pExe )
{
    // One %s per usage line; every one of them receives the program name.
    // Angle-bracketed words are placeholders for user supplied arguments.
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    "       %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "  -h, --help: show help\n"
    "  -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    "      and prints the mimetype found to stdout\n"
    "  -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    "  -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    "      object numbers, where object number and generation number are separated by \':\'\n"
    "      an omitted generation number defaults to 0\n"
    "  -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
namespace {

/** EmitContext implementation backed by two files: an output file that
    receives everything passed to write(), and the original input file from
    which byte ranges can be copied or read back.
*/
class FileEmitContext : public EmitContext
{
    oslFileHandle m_aHandle;        // output file handle; initialised to nullptr by the constructor
    oslFileHandle m_aReadHandle;    // handle on the original input file, opened by openReadFile()
    unsigned int  m_nReadLen;       // input file size as determined by openReadFile(); 0 until then

    /// Open pOrigName for reading and record its size in m_nReadLen.
    void openReadFile( const char* pOrigName );

public:
    /** @param pFileName  system path of the output file
        @param pOrigName  system path of the original input file
        @param pTop       forwarded unchanged to the EmitContext base class
    */
    FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
    virtual ~FileEmitContext() override;

    // The destructor closes both raw handles, so a copy would close them a
    // second time. Copying is therefore disabled.
    FileEmitContext( const FileEmitContext& ) = delete;
    FileEmitContext& operator=( const FileEmitContext& ) = delete;

    /// Write nLen bytes from pBuf to the output file; true only if all bytes were written.
    virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override;
    /// Current position in the output file, or 0 if it cannot be determined.
    virtual unsigned int getCurPos() noexcept override;
    /// Copy nLen bytes starting at nOrigOffset of the input file to the output file.
    virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override;
    /// Read up to nLen bytes starting at nOrigOffset of the input file into pBuf; returns the byte count read.
    virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override;
};

}
/** Open (or create) the output file, truncate it, and open the input file.

    Failures are reported on stderr only. If the path conversion or the
    creation of the output file fails, construction stops early: m_bDeflate
    is not set and the input file is not opened.
*/
FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
    : EmitContext( pTop ),
      m_aHandle( nullptr ),
      m_aReadHandle( nullptr ),
      m_nReadLen( 0 )
{
    OUString aSystemPath(
        OStringToOUString( std::string_view( pFileName ), osl_getThreadTextEncoding() ) );
    OUString aFileURL;
    const oslFileError eConverted
        = osl_getFileURLFromSystemPath( aSystemPath.pData, &aFileURL.pData );
    if ( eConverted != osl_File_E_None )
    {
        fprintf( stderr, "filename conversion \" %s\" failed\n", pFileName );
        return;
    }

    // First try to open an already existing file ...
    const bool bOpenedExisting
        = osl_openFile( aFileURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None;
    if ( !bOpenedExisting )
    {
        // ... and only create a new one when that did not work.
        const oslFileError eCreated = osl_openFile(
            aFileURL.pData, &m_aHandle, osl_File_OpenFlag_Write | osl_File_OpenFlag_Create );
        if ( eCreated != osl_File_E_None )
        {
            fprintf( stderr, "could not open %s\n", pFileName );
            return;
        }
    }
    else if ( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
    {
        // An existing file that cannot be emptied is given up again;
        // construction nevertheless continues below.
        fprintf( stderr, "could not truncate %s\n", pFileName );
        osl_closeFile( m_aHandle );
        m_aHandle = nullptr;
    }

    m_bDeflate = true;
    openReadFile( pOrigName );
}
/// Close the output file and then the input file, skipping handles that are null.
FileEmitContext::~FileEmitContext()
{
    const auto closeIfOpen = []( oslFileHandle aFile )
    {
        if ( aFile != nullptr )
            osl_closeFile( aFile );
    };

    closeIfOpen( m_aHandle );
    closeIfOpen( m_aReadHandle );
}
/** Open the original input file for reading and record its size.

    On success m_aReadHandle is a valid handle and m_nReadLen holds the file
    size. On any failure a message is written to stderr, m_aReadHandle is
    left as nullptr and m_nReadLen is not modified.

    @param pInFile  system path of the input file
*/
void FileEmitContext::openReadFile( const char* pInFile )
{
    OUString aSysFile(
        OStringToOUString( std::string_view( pInFile ), osl_getThreadTextEncoding() ) );
    OUString aURL;
    if ( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
    {
        fprintf( stderr, "filename conversion \" %s\" failed\n", pInFile );
        return;
    }

    if ( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
    {
        fprintf( stderr, "could not open %s\n", pInFile );
        m_aReadHandle = nullptr;    // make sure no stale value survives a failed open
        return;
    }

    // The file size is obtained by seeking to the end and asking for the
    // resulting position.
    if ( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
    {
        fprintf( stderr, "could not seek to end of %s\n", pInFile );
        osl_closeFile( m_aReadHandle );
        // Reset the handle: the destructor closes every non-null handle and
        // would otherwise close this one a second time.
        m_aReadHandle = nullptr;
        return;
    }

    sal_uInt64 nFileSize = 0;
    if ( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
    {
        fprintf( stderr, "could not get end pos of %s\n", pInFile );
        osl_closeFile( m_aReadHandle );
        m_aReadHandle = nullptr;    // same reason as above
        return;
    }

    // NOTE(review): the size is narrowed to unsigned int, so a file larger
    // than UINT_MAX bytes is recorded with a truncated length.
    m_nReadLen = static_cast<unsigned int>(nFileSize);
}
bool FileEmitContext::write( const void * pBuf, unsigned int nLen ) noexcept
{
if ( ! m_aHandle )
return false ;
sal_uInt64 nWrite = static_cast <sal_uInt64>(nLen);
sal_uInt64 nWritten = 0;
return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
&& nWrite == nWritten;
}
unsigned int FileEmitContext::getCurPos() noexcept
{
sal_uInt64 nFileSize = 0;
if ( m_aHandle )
{
if ( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
nFileSize = 0;
}
return static_cast <unsigned int >(nFileSize);
}
/** Copy a byte range of the original input file to the output file.

    @param nOrigOffset  absolute offset of the first byte in the input file
    @param nLen         number of bytes to copy
    @return true when the whole range was read and written; false when no
            input file is open, the range does not lie within the recorded
            input size, or seeking, allocating, reading or writing fails
*/
bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept
{
    if ( !m_aReadHandle )
        return false;

    // Range check written so that it cannot wrap around: the sum
    // nOrigOffset + nLen may overflow unsigned int and would then pass a
    // naive "offset + len > size" test.
    if ( nOrigOffset > m_nReadLen || nLen > m_nReadLen - nOrigOffset )
        return false;

    if ( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
    {
        fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
        return false;
    }

    // malloc(0) may legitimately return a null pointer; request at least one
    // byte so that a null result always means an allocation failure.
    void* pBuf = std::malloc( nLen != 0 ? nLen : 1 );
    if ( !pBuf )
        return false;

    sal_uInt64 nBytesRead = 0;
    if ( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
         || nBytesRead != static_cast<sal_uInt64>(nLen) )
    {
        fprintf( stderr, "could not read %u bytes\n", nLen );
        std::free( pBuf );
        return false;
    }

    const bool bRet = write( pBuf, nLen );
    std::free( pBuf );
    return bRet;
}
/** Read a byte range of the original input file into a caller buffer.

    @param nOrigOffset  absolute offset of the first byte in the input file
    @param nLen         number of bytes requested
    @param pBuf         destination buffer; passed to osl_readFile together with nLen
    @return the number of bytes actually read; 0 when no input file is open,
            the range does not lie within the recorded input size, or
            seeking or reading fails
*/
unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept
{
    if ( !m_aReadHandle )
        return 0;

    // Overflow-safe range check: nOrigOffset + nLen could wrap around in
    // unsigned arithmetic and slip past a plain sum comparison.
    if ( nOrigOffset > m_nReadLen || nLen > m_nReadLen - nOrigOffset )
        return 0;

    if ( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
    {
        fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
        return 0;
    }

    sal_uInt64 nBytesRead = 0;
    if ( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
        return 0;

    return static_cast<unsigned int>(nBytesRead);
}
typedef int (*PDFFileHdl)(const char *, const char *, PDFFile*);
static int handleFile( const char * pInFile, const char * pOutFile, const char * pPassword, PDFFileHdl pHdl )
{
int nRet = 0;
std::unique_ptr<PDFEntry> pEntry
= pdfparse::PDFReader::read(OStringToOUString(pInFile, osl_getThreadTextEncoding()));
if ( pEntry )
{
PDFFile* pPDFFile = dynamic_cast <PDFFile*>(pEntry.get());
if ( pPDFFile )
{
fprintf( stdout, "have a %s PDF file\n" , pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
if ( pPassword )
fprintf( stdout, "password %s\n" ,
pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
nRet = pHdl( pInFile, pOutFile, pPDFFile );
}
else
nRet = 20;
}
return nRet;
}
static int write_unzipFile( const char * pInFile, const char * pOutFile, PDFFile* pPDFFile )
{
FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
aContext.m_bDecrypt = pPDFFile->isEncrypted();
pPDFFile->emit(aContext);
return 0;
}
/** Write out every stream listed in an AdditionalStreams array.

    The array is processed as consecutive (mimetype name, stream reference)
    pairs. Each referenced stream is written to
    "<pOutFile>_stream_<number>_<generation>" and its mimetype is printed to
    stdout. A trailing element without a partner is ignored.

    @param pOutFile  prefix for the names of the files to create
    @param pStreams  the array to process
    @param pPDFFile  parsed file used to resolve the stream references
    @param pInFile   path of the original input file
    @return 0 on success, 120 when a pair does not consist of a name followed
            by an object reference, 121 when a referenced object is missing;
            processing stops at the first error
*/
static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
{
    int nRet = 0;
    const unsigned int nArrayElements = pStreams->m_aSubElements.size();

    // "i + 1 < n" instead of "i < n - 1": the latter underflows for an empty
    // array and would index far out of bounds. The index advances by two
    // because each entry occupies two slots; advancing by one would treat
    // the stream reference of one pair as the mimetype of the next.
    for ( unsigned int i = 0; i + 1 < nArrayElements && nRet == 0; i += 2 )
    {
        PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i].get());
        PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1].get());
        if ( !pMimeType )
            fprintf( stderr, "error: no mimetype element\n" );
        if ( !pStreamRef )
            fprintf( stderr, "error: no stream ref element\n" );
        if ( pMimeType && pStreamRef )
        {
            fprintf( stdout, "found stream %d %d with mimetype %s\n",
                     pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
                     pMimeType->m_aName.getStr() );
            PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
            if ( pObject )
            {
                OString aOutStream = pOutFile +
                                     OString::Concat("_stream_") +
                                     OString::number( sal_Int32(pStreamRef->m_nNumber) ) +
                                     "_" +
                                     OString::number( sal_Int32(pStreamRef->m_nGeneration) );
                FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
                aContext.m_bDecrypt = pPDFFile->isEncrypted();
                pObject->writeStream( aContext, pPDFFile );
            }
            else
            {
                fprintf( stderr, "object not found\n" );
                nRet = 121;
            }
        }
        else
            nRet = 120;
    }
    return nRet;
}
static int write_addStreams( const char * pInFile, const char * pOutFile, PDFFile* pPDFFile )
{
// find all trailers
int nRet = 0;
unsigned int nElements = pPDFFile->m_aSubElements.size();
for ( unsigned i = 0; i < nElements && nRet == 0; i++ )
{
PDFTrailer* pTrailer = dynamic_cast <PDFTrailer*>(pPDFFile->m_aSubElements[i].get());
if ( pTrailer && pTrailer->m_pDict )
{
// search for AdditionalStreams entry
auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" _ostr );
if ( add_stream != pTrailer->m_pDict->m_aMap.end() )
{
PDFArray* pStreams = dynamic_cast <PDFArray*>(add_stream->second);
if ( pStreams )
nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
}
}
}
return nRet;
}
static int write_fonts( const char * i_pInFile, const char * i_pOutFile, PDFFile* i_pPDFFile )
{
unsigned int nElements = i_pPDFFile->m_aSubElements.size();
for (unsigned i = 0; i < nElements; i++)
{
// search FontDescriptors
PDFObject* pObj = dynamic_cast <PDFObject*>(i_pPDFFile->m_aSubElements[i].get());
if ( ! pObj )
continue ;
PDFDict* pDict = dynamic_cast <PDFDict*>(pObj->m_pObject);
if ( ! pDict )
continue ;
std::unordered_map<OString,PDFEntry*>::iterator map_it =
pDict->m_aMap.find( "Type" _ostr );
if ( map_it == pDict->m_aMap.end() )
continue ;
PDFName* pName = dynamic_cast <PDFName*>(map_it->second);
if ( ! pName )
continue ;
if ( pName->m_aName != "FontDescriptor" )
continue ;
// the font name will be helpful, also there must be one in
// a font descriptor
map_it = pDict->m_aMap.find( "FontName" _ostr );
if ( map_it == pDict->m_aMap.end() )
continue ;
pName = dynamic_cast <PDFName*>(map_it->second);
if ( ! pName )
continue ;
OString aFontName( pName->m_aName );
PDFObjectRef* pStreamRef = nullptr;
const char * pFileType = nullptr;
// we have a font descriptor, try for a type 1 font
map_it = pDict->m_aMap.find( "FontFile" _ostr );
if ( map_it != pDict->m_aMap.end() )
{
pStreamRef = dynamic_cast <PDFObjectRef*>(map_it->second);
if ( pStreamRef )
pFileType = "pfa" ;
}
// perhaps it's a truetype file ?
if ( ! pStreamRef )
{
map_it = pDict->m_aMap.find( "FontFile2" _ostr );
if ( map_it != pDict->m_aMap.end() )
{
pStreamRef = dynamic_cast <PDFObjectRef*>(map_it->second);
if ( pStreamRef )
pFileType = "ttf" ;
}
}
if ( ! pStreamRef )
continue ;
PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
if ( ! pStream )
continue ;
OStringBuffer aOutStream( OString::Concat(i_pOutFile)
+ "_font_"
+ OString::number( sal_Int32(pStreamRef->m_nNumber) )
+ "_"
+ OString::number( sal_Int32(pStreamRef->m_nGeneration) )
+ "_"
+ aFontName );
if ( pFileType )
{
aOutStream.append( OString::Concat("." ) + pFileType );
}
FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
pStream->writeStream( aContext, i_pPDFFile );
}
return 0;
}
static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
static int write_objects( const char * i_pInFile, const char * i_pOutFile, PDFFile* i_pPDFFile )
{
unsigned int nElements = s_aEmitObjects.size();
for (unsigned i = 0; i < nElements; i++)
{
sal_Int32 nObject = s_aEmitObjects[i].first;
sal_Int32 nGeneration = s_aEmitObjects[i].second;
PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
if ( ! pStream )
{
fprintf( stderr, "object %d %d not found !\n" , static_cast <int >(nObject), static_cast <int >(nGeneration) );
continue ;
}
OString aOutStream = i_pOutFile +
OString::Concat("_stream_" ) +
OString::number( nObject ) +
"_" +
OString::number( nGeneration );
FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
pStream->writeStream( aContext, i_pPDFFile );
}
return 0;
}
/** Program entry point: parse the command line, derive an output name when
    none was given, and run the selected mode on the input file.

    Arguments starting with '-' are options; the first other argument is the
    input file, the second the output file, any further ones are ignored.

    Exit codes produced here: 0 after printing help, 1 for a command line
    error, 10 when no input file was given, 11 when no output name could be
    derived; otherwise the result of handleFile.
*/
SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
{
    const char* pInFile   = nullptr;
    const char* pOutFile  = nullptr;
    const char* pPassword = nullptr;
    OStringBuffer aOutFile( 256 );          // backing storage for a derived output name
    PDFFileHdl aHdl = write_unzipFile;      // default mode when no mode option is given

    for ( int nArg = 1; nArg < argc; nArg++ )
    {
        if ( argv[nArg][0] == '-' )
        {
            if ( ! rtl_str_compare( "-pw", argv[nArg] ) ||
                 ! rtl_str_compare( "--password", argv[nArg] ) )
            {
                if ( nArg == argc-1 )
                {
                    fprintf( stderr, "no password given\n" );
                    return 1;
                }
                nArg++;
                pPassword = argv[nArg];
            }
            else if ( ! rtl_str_compare( "-h", argv[nArg] ) ||
                      ! rtl_str_compare( "--help", argv[nArg] ) )
            {
                printHelp( argv[0] );
                return 0;
            }
            else if ( ! rtl_str_compare( "-a", argv[nArg] ) ||
                      ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
            {
                aHdl = write_addStreams;
            }
            else if ( ! rtl_str_compare( "-f", argv[nArg] ) ||
                      ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
            {
                aHdl = write_fonts;
            }
            else if ( ! rtl_str_compare( "-o", argv[nArg] ) ||
                      ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
            {
                aHdl = write_objects;
                // The object list is mandatory; report its absence instead
                // of silently extracting nothing (same handling as -pw).
                if ( nArg == argc-1 )
                {
                    fprintf( stderr, "no object list given\n" );
                    return 1;
                }
                nArg++;

                // Syntax: <object>[:<generation>] entries separated by ','.
                OString aObjs( argv[nArg] );
                sal_Int32 nIndex = 0;
                while ( nIndex != -1 )
                {
                    OString aToken( aObjs.getToken( 0, ',', nIndex ) );
                    sal_Int32 nObject = 0;
                    sal_Int32 nGeneration = 0;  // default when no generation is given
                    sal_Int32 nGenIndex = 0;
                    nObject = o3tl::toInt32( o3tl::getToken( aToken, 0, ':', nGenIndex ) );
                    // nGenIndex is -1 when the token contained no ':'
                    if ( nGenIndex != -1 )
                        nGeneration = o3tl::toInt32( o3tl::getToken( aToken, 0, ':', nGenIndex ) );
                    s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
                }
            }
            else
            {
                fprintf( stderr, "unrecognized option \" %s\"\n",
                         argv[nArg] );
                printHelp( argv[0] );
                return 1;
            }
        }
        else if ( pInFile == nullptr )
            pInFile = argv[nArg];
        else if ( pOutFile == nullptr )
            pOutFile = argv[nArg];
    }

    if ( ! pInFile )
    {
        fprintf( stderr, "no input file given\n" );
        return 10;
    }

    if ( ! pOutFile )
    {
        // Derive "<input without .pdf>_unzip.pdf" from the input name.
        OString aFile( pInFile );
        if ( aFile.getLength() > 0 )
        {
            // Only strip the extension when something remains in front of
            // it. Every other name, including names of four characters or
            // fewer, is used unchanged as the base, so the output name
            // always contains the input name.
            if ( aFile.getLength() > 4
                 && aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) )
                aOutFile.append( pInFile, aFile.getLength() - 4 );
            else
                aOutFile.append( aFile );
            aOutFile.append( "_unzip.pdf" );
            pOutFile = aOutFile.getStr();
        }
        else
        {
            fprintf( stderr, "no output file given\n" );
            return 11;
        }
    }

    return handleFile( pInFile, pOutFile, pPassword, aHdl );
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
/*
 * Messung V0.5 C=96 H=98 G=96
 * Dauer der Verarbeitung: 0.16 Sekunden (vorverarbeitet)
 * © Formatika GbR, Deutschland
 */