B
    |b7                 @   s  d Z ddlZddlZddlZddlZddlmZmZm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlZddlmZmZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZm Z m!Z! dd
l"m#Z#m$Z$m%Z% ddl&m'Z' e(  e)e*Z+e,dZ-ee.e/f e.dddZ0d-ee1ee. ddddZ2d.eee3ddddZ4d/eeee. e3ddddZ5d0ee.eee6 e.e3ee. ee. dd	ddZ7e%dZ8e%dZ9e.e.e.dd d!d"Z:d1ee.e
e6 ee6 e.e3ee. ee. e3dd#
d$d%Z;ed&d'd(Z<d2eee.  dd)d*d+Z=e*d,kre=  dS )3z#Extract pdf structure in XML format    N)	Any	ContainerDictIterableListOptionalTextIOUnioncast)ArgumentParser)PDFDocumentPDFNoOutlinesPDFXRefFallback)PDFPage)	PDFParser)PDFObjectNotFoundPDFValueError)	PDFStream	PDFObjRefresolve1stream_value)	PSKeyword	PSLiteralLIT)isnumberz&[\000-\037&<>()"\042\047\134\177-\377])sreturnc             C   s*   t | trt| d}n| }tdd |S )Nzlatin-1c             S   s   dt | d S )Nz&#%d;r   )ordgroup)m r    W/home/ankuromar296_gmail_com/.local/lib/python3.7/site-packages/../../../bin/dumppdf.py<lambda>       zescape.<locals>.<lambda>)
isinstancebytesstrESC_PATsub)r   usr    r    r!   escape   s    
r*   )outobjcodecr   c             C   s  |d kr|  d d S t|tr|  dt|  x@| D ]4\}}|  d|  |  d t| | |  d q<W |  d d S t|tr|  dt|  x |D ]}t| | |  d qW |  d	 d S t|ttfr|  d
t|t	|f  d S t|t
r|dkr |  |  np|dkr:|  |  nV|  d t| |j |  d |dkr| }|  dt|t	|f  |  d d S t|tr|  d|j  d S t|tr|  d|j  d S t|tr|  d|j  d S t|r|  d|  d S t|d S )Nz<null />z<dict size="%d">
z<key>%s</key>
z<value>z	</value>
z</dict>z<list size="%d">

z</list>z<string size="%d">%s</string>rawbinaryz<stream>
<props>
z

</props>
textz<data size="%d">%s</data>
z	</stream>z<ref id="%d" />z<keyword>%s</keyword>z<literal>%s</literal>z<number>%s</number>)writer$   dictlenitemsdumpxmllistr&   r%   r*   r   Zget_rawdataget_dataattrsr   objidr   namer   r   	TypeError)r+   r,   r-   kvdatar    r    r!   r6   !   s`    















r6   F)r+   docshow_fallback_xrefr   c             C   sn   x>|j D ]4}t|tr|r| d t| |  | d qW tdd |j D }|rj|sjd}t| d S )Nz
<trailer>
z
</trailer>

c             s   s   | ]}t |tV  qd S )N)r$   r   ).0xrefr    r    r!   	<genexpr>j   s    zdumptrailers.<locals>.<genexpr>zThis PDF does not have an xref. Use --show-fallback-xref if you want to display the content of a fallback xref that contains all objects.)	xrefsr$   r   r2   r6   Zget_trailerallloggerwarning)r+   r@   rA   rC   Zno_xrefsmsgr    r    r!   dumptrailersb   s    

rJ   )r+   r@   r-   rA   r   c       	      C   s   t  }| d x|jD ]}x| D ]}||kr4q&|| y>||}|d krTw&| d|  t| ||d | d W q& tk
r } ztd|  W d d }~X Y q&X q&W qW t	| || | d d S )Nz<pdf>z<object id="%d">
)r-   z
</object>

znot found: %rz</pdf>)
setr2   rE   
get_objidsaddgetobjr6   r   printrJ   )	r+   r@   r-   rA   visitedrC   r:   r,   er    r    r!   dumpallobjsu   s&    


&
rR    )	outfpfnameobjidspagenospassworddumpallr-   
extractdirr   c                s  t |d}t|}	t|	| dd tt dD }
ttd fdd}y  }| 	d x|D ]\}}}}}d }|r||}|
|d	 j
 }nP|r|}t|tr|d
}|rt|dkr|dr||d }|
|d	 j
 }t|}| 	d|| |d k	r&| 	d t| | | 	d |d k	r>| 	d|  | 	d qfW | 	d W n tk
rn   Y nX |	  |  d S )Nrbc             S   s   i | ]\}}||j qS r    )Zpageid)rB   pagenopager    r    r!   
<dictcomp>   s   zdumpoutline.<locals>.<dictcomp>   )destr   c                s`   t | ttfrt | } nt | tr8t | j} t | trJ| d } t | tr\| 	 } | S )ND)
r$   r&   r%   r   Zget_destr   r;   r3   r   resolve)r`   )r@   r    r!   resolve_dest   s    


z!dumpoutline.<locals>.resolve_destz<outlines>
r   Sz/'GoTo'ra   z"<outline level="{!r}" title="{}">
z<dest>z</dest>
z<pageno>%r</pageno>
z</outline>
z</outlines>
)openr   r   	enumerater   create_pagesobjectr   Zget_outlinesr2   r:   r$   r3   getreprr*   formatr6   r   close)rT   rU   rV   rW   rX   rY   r-   rZ   fpparserZpagesrc   Zoutlinesleveltitler`   aser\   actionsubtyper   r    )r@   r!   dumpoutline   sH    










ru   ZFilespecZEmbeddedFile)rU   rX   rZ   r   c       
   	      s   t tttf d d fdd}t| d~}t|}t|| t }x^ jD ]T}xN|	 D ]B} 
|}	||krZt|	trZ|	dtkrZ|| |||	 qZW qLW W d Q R X d S )N)r:   r,   r   c                s   t j|dp"tt|d }|d dp@|d d} |j}t	|t
shd| }t||dtk	rtd| t jd| |f }t j|rtd| td	|  t jt j|d
d t|d}||  |  d S )NZUFFZEFz:unable to process PDF: reference for %r is not a PDFStreamTypez>unable to process PDF: reference for %r is not an EmbeddedFilez%.6d-%szfile exists: %rzextracting: %rT)exist_okwb)ospathbasenameri   r
   r%   decoderN   r:   r$   r   r   LITERAL_EMBEDDEDFILEjoinexistsIOErrorrO   makedirsdirnamere   r2   r8   rl   )r:   r,   filenameZfilereffileobj	error_msgr{   r+   )r@   rZ   r    r!   extract1   s(    &

z!extractembedded.<locals>.extract1r[   rw   )intr   r&   r   re   r   r   rK   rE   rL   rN   r$   r3   ri   LITERAL_FILESPECrM   )
rU   rX   rZ   r   rm   rn   Zextracted_objidsrC   r:   r,   r    )r@   rZ   r!   extractembedded   s     



r   )
rT   rU   rV   rW   rX   rY   r-   rZ   rA   r   c	             C   s   t |d}	t|	}
t|
|}|rFx$|D ]}||}t| ||d q&W |rxZtt|D ]H\}}||krZ|rx2|jD ]}t	|}t| ||d qvW qZt| |j
 qZW |rt| ||| |s|s|st| || |	  |dkr| d d S )Nr[   )r-   )r/   r0   r.   )re   r   r   rN   r6   rf   r   rg   contentsr   r9   rR   rJ   rl   r2   )rT   rU   rV   rW   rX   rY   r-   rZ   rA   rm   rn   r@   r:   r,   r\   r]   r    r    r!   dumppdf   s.    




r   )r   c              C   sd  t tdd} | jdtd ddd | jddd	d
tjd | jdddddd |  }|jdddddd |jddtdd | jddd}|jdt	d ddd |jddtd d |jd!d"td#d |jd$d%ddd&d |jd'dd(d) |jd*d+td,d-d. | jd/d0d}|jd1d2td3d4d. | }|jd5d6ddd7d |jd8d9ddd:d |jd;d<ddd=d | S )>NT)descriptionadd_helpfiles+zOne or more paths to PDF files.)typedefaultnargshelpz	--versionz-vversionzpdfminer.six v{})rs   r   z--debugz-dF
store_truezUse debug logging level.)r   rs   r   z--extract-tocz-TzExtract structure of outlinez--extract-embeddedz-EzExtract embedded files)r   r   ParserzUsed during PDF parsing)r   z--page-numbersz0A space-seperated list of page numbers to parse.z	--pagenosz-pzA comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.z	--objectsz-iz1Comma separated list of object numbers to extractz--allz-az3If the structure of all objects should be extractedz--show-fallback-xrefzAdditionally show the fallback xref. Use this if the PDF has zero or only invalid xref's. This setting is ignored if --extract-toc or --extract-embedded is used.)rs   r   z
--passwordz-PrS   z,The password to use for decrypting PDF file.)r   r   r   OutputzUsed during output generation.z	--outfilez-o-zJPath to file where output is written. Or "-" (default) to write to stdout.z--raw-streamz-rz%Write stream objects without encodingz--binary-streamz-bz)Write stream objects with binary encodingz--text-streamz-tz"Write stream objects as plain text)
r   __doc__add_argumentr&   rk   pdfminer__version__add_mutually_exclusive_groupadd_argument_groupr   )rn   Zprocedure_parserZparse_paramsZoutput_paramsZcodec_parserr    r    r!   create_parser!  s    

r   )argvr   c       	      C   sJ  t  }|j| d}|jr(t tj |jdkr:tj	}nt
|jd}|jrddd |jdD }ng }|jrdd |jD }n$|jrd	d |jdD }nt }|j}|jrd
}n|jrd}n|jrd}nd }xj|jD ]`}|jrt||||||j|d d q|jrt|||jd qt||||||j|d |jd	 qW |  d S )N)argsr   wc             S   s   g | ]}t |qS r    )r   )rB   xr    r    r!   
<listcomp>  s    zmain.<locals>.<listcomp>,c             S   s   h | ]}|d  qS )r_   r    )rB   r   r    r    r!   	<setcomp>  s    zmain.<locals>.<setcomp>c             S   s   h | ]}t |d  qS )r_   )r   )rB   r   r    r    r!   r     s    r/   r0   r1   )rX   rY   r-   rZ   )rX   rZ   )rX   rY   r-   rZ   rA   )r   
parse_argsdebuglogging	getLoggersetLevelDEBUGoutfilesysstdoutre   objectssplitZpage_numbersrW   rK   rX   Z
raw_streamZbinary_streamZtext_streamr   Zextract_tocru   rF   Zextract_embeddedr   r   rA   rl   )	r   rn   r   rT   rV   rW   rX   r-   rU   r    r    r!   main  s^    

r   __main__)N)F)NF)rS   FNN)rS   FNNF)N)>r   r   os.pathrz   rer   typingr   r   r   r   r   r   r   r	   r
   argparser   r   Zpdfminer.pdfdocumentr   r   r   Zpdfminer.pdfpager   Zpdfminer.pdfparserr   Zpdfminer.pdftypesr   r   r   r   r   r   Zpdfminer.psparserr   r   r   Zpdfminer.utilsr   basicConfigr   __name__rG   compiler'   r&   r%   r*   rh   r6   boolrJ   rR   r   ru   r   r~   r   r   r   r   r    r    r    r!   <module>   sT   ,

B    40    $vA
