feat: Add new gcloud commands, API clients, and third-party libraries across various services.

This commit is contained in:
2026-01-01 20:26:35 +01:00
parent 5e23cbece0
commit a19e592eb7
25221 changed files with 8324611 additions and 0 deletions

View File

@@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# All command targets are phony (they never correspond to files).
# latexpdfja, texinfo, info, xml, and pseudoxml exist below, so they are
# declared here as well.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp \
	devhelp epub latex latexpdf latexpdfja text man texinfo info gettext \
	changes xml pseudoxml linkcheck doctest

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/chardet.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/chardet.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/chardet"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/chardet"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	# Use $(MAKE), not a literal `make`, so -j/-n and the jobserver propagate.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

View File

@@ -0,0 +1,3 @@
For users, docs are now available at https://chardet.readthedocs.io/.
For devs, you can edit the RST files in this directory.

View File

@@ -0,0 +1,310 @@
chardet package
===============
Submodules
----------
chardet.big5freq module
-----------------------
.. automodule:: chardet.big5freq
:members:
:undoc-members:
:show-inheritance:
chardet.big5prober module
-------------------------
.. automodule:: chardet.big5prober
:members:
:undoc-members:
:show-inheritance:
chardet.chardetect module
-------------------------
.. automodule:: chardet.chardetect
:members:
:undoc-members:
:show-inheritance:
chardet.chardistribution module
-------------------------------
.. automodule:: chardet.chardistribution
:members:
:undoc-members:
:show-inheritance:
chardet.charsetgroupprober module
---------------------------------
.. automodule:: chardet.charsetgroupprober
:members:
:undoc-members:
:show-inheritance:
chardet.charsetprober module
----------------------------
.. automodule:: chardet.charsetprober
:members:
:undoc-members:
:show-inheritance:
chardet.codingstatemachine module
---------------------------------
.. automodule:: chardet.codingstatemachine
:members:
:undoc-members:
:show-inheritance:
chardet.compat module
---------------------
.. automodule:: chardet.compat
:members:
:undoc-members:
:show-inheritance:
chardet.constants module
------------------------
.. automodule:: chardet.constants
:members:
:undoc-members:
:show-inheritance:
chardet.cp949prober module
--------------------------
.. automodule:: chardet.cp949prober
:members:
:undoc-members:
:show-inheritance:
chardet.escprober module
------------------------
.. automodule:: chardet.escprober
:members:
:undoc-members:
:show-inheritance:
chardet.escsm module
--------------------
.. automodule:: chardet.escsm
:members:
:undoc-members:
:show-inheritance:
chardet.eucjpprober module
--------------------------
.. automodule:: chardet.eucjpprober
:members:
:undoc-members:
:show-inheritance:
chardet.euckrfreq module
------------------------
.. automodule:: chardet.euckrfreq
:members:
:undoc-members:
:show-inheritance:
chardet.euckrprober module
--------------------------
.. automodule:: chardet.euckrprober
:members:
:undoc-members:
:show-inheritance:
chardet.euctwfreq module
------------------------
.. automodule:: chardet.euctwfreq
:members:
:undoc-members:
:show-inheritance:
chardet.euctwprober module
--------------------------
.. automodule:: chardet.euctwprober
:members:
:undoc-members:
:show-inheritance:
chardet.gb2312freq module
-------------------------
.. automodule:: chardet.gb2312freq
:members:
:undoc-members:
:show-inheritance:
chardet.gb2312prober module
---------------------------
.. automodule:: chardet.gb2312prober
:members:
:undoc-members:
:show-inheritance:
chardet.hebrewprober module
---------------------------
.. automodule:: chardet.hebrewprober
:members:
:undoc-members:
:show-inheritance:
chardet.jisfreq module
----------------------
.. automodule:: chardet.jisfreq
:members:
:undoc-members:
:show-inheritance:
chardet.jpcntx module
---------------------
.. automodule:: chardet.jpcntx
:members:
:undoc-members:
:show-inheritance:
chardet.langbulgarianmodel module
---------------------------------
.. automodule:: chardet.langbulgarianmodel
:members:
:undoc-members:
:show-inheritance:
chardet.langcyrillicmodel module
--------------------------------
.. automodule:: chardet.langcyrillicmodel
:members:
:undoc-members:
:show-inheritance:
chardet.langgreekmodel module
-----------------------------
.. automodule:: chardet.langgreekmodel
:members:
:undoc-members:
:show-inheritance:
chardet.langhebrewmodel module
------------------------------
.. automodule:: chardet.langhebrewmodel
:members:
:undoc-members:
:show-inheritance:
chardet.langhungarianmodel module
---------------------------------
.. automodule:: chardet.langhungarianmodel
:members:
:undoc-members:
:show-inheritance:
chardet.langthaimodel module
----------------------------
.. automodule:: chardet.langthaimodel
:members:
:undoc-members:
:show-inheritance:
chardet.latin1prober module
---------------------------
.. automodule:: chardet.latin1prober
:members:
:undoc-members:
:show-inheritance:
chardet.mbcharsetprober module
------------------------------
.. automodule:: chardet.mbcharsetprober
:members:
:undoc-members:
:show-inheritance:
chardet.mbcsgroupprober module
------------------------------
.. automodule:: chardet.mbcsgroupprober
:members:
:undoc-members:
:show-inheritance:
chardet.mbcssm module
---------------------
.. automodule:: chardet.mbcssm
:members:
:undoc-members:
:show-inheritance:
chardet.sbcharsetprober module
------------------------------
.. automodule:: chardet.sbcharsetprober
:members:
:undoc-members:
:show-inheritance:
chardet.sbcsgroupprober module
------------------------------
.. automodule:: chardet.sbcsgroupprober
:members:
:undoc-members:
:show-inheritance:
chardet.sjisprober module
-------------------------
.. automodule:: chardet.sjisprober
:members:
:undoc-members:
:show-inheritance:
chardet.universaldetector module
--------------------------------
.. automodule:: chardet.universaldetector
:members:
:undoc-members:
:show-inheritance:
chardet.utf8prober module
-------------------------
.. automodule:: chardet.utf8prober
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: chardet
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,7 @@
chardet
=======
.. toctree::
:maxdepth: 4
chardet

View File

@@ -0,0 +1,276 @@
# chardet documentation build configuration file, created by
# sphinx-quickstart on Thu Mar 27 00:17:49 2015.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import os

import chardet

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.mathjax"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix of source filenames.
source_suffix = ".rst"

# The encoding of source files.
# source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "chardet"
copyright = "2015, Mark Pilgrim, Dan Blanchard, Ian Cordasco"  # pylint: disable=redefined-builtin

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = chardet.__version__
# The full version, including alpha/beta/rc tags.
release = chardet.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ["_build"]

# The reST default role (used for this markup: `text`) to use for all
# documents.
# default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
# keep_warnings = False

# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. Read the Docs injects its own theme, so the RTD
# theme is only imported and configured when building locally.
on_rtd = os.environ.get("READTHEDOCS", None) == "True"

if not on_rtd:
    import sphinx_rtd_theme  # pylint: disable=import-error

    html_theme = "sphinx_rtd_theme"
    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
# html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
# html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
# html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
# html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
# html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
# html_extra_path = []

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
# html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
# html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
# html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
# html_additional_pages = {}

# If false, no module index is generated.
# html_domain_indices = True

# If false, no index is generated.
# html_use_index = True

# If true, the index is split into individual pages for each letter.
# html_split_index = False

# If true, links to the reST sources are added to the pages.
# html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
# html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
# html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
# html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = "chardetdoc"

# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        "index",
        "chardet.tex",
        "chardet Documentation",
        "Mark Pilgrim, Dan Blanchard, Ian Cordasco",
        "manual",
    ),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
# latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
# latex_use_parts = False

# If true, show page references after internal links.
# latex_show_pagerefs = False

# If true, show URL addresses after external links.
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
# latex_appendices = []

# If false, no module index is generated.
# latex_domain_indices = True

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (
        "index",
        "chardet",
        "chardet Documentation",
        ["Mark Pilgrim, Dan Blanchard, Ian Cordasco"],
        1,
    )
]

# If true, show URL addresses after external links.
# man_show_urls = False

# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        "index",
        "chardet",
        "chardet Documentation",
        "Mark Pilgrim, Dan Blanchard, Ian Cordasco",
        "chardet",
        "Universal charset detector.",
        "Miscellaneous",
    ),
]

# Documents to append as an appendix to all manuals.
# texinfo_appendices = []

# If false, no module index is generated.
# texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
# texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
# texinfo_no_detailmenu = False

View File

@@ -0,0 +1,105 @@
Frequently asked questions
==========================
What is character encoding?
---------------------------
When you think of “text”, you probably think of “characters and symbols
I see on my computer screen”. But computers don't deal in characters and
symbols; they deal in bits and bytes. Every piece of text you've ever
seen on a computer screen is actually stored in a particular *character
encoding*. There are many different character encodings, some optimized
for particular languages like Russian or Chinese or English, and others
that can be used for multiple languages. Very roughly speaking, the
character encoding provides a mapping between the stuff you see on your
screen and the stuff your computer actually stores in memory and on
disk.
In reality, it's more complicated than that. Many characters are common
to multiple encodings, but each encoding may use a different sequence of
bytes to actually store those characters in memory or on disk. So you
can think of the character encoding as a kind of decryption key for the
text. Whenever someone gives you a sequence of bytes and claims it's
“text”, you need to know what character encoding they used so you can
decode the bytes into characters and display them (or process them, or
whatever).
What is character encoding auto-detection?
------------------------------------------
It means taking a sequence of bytes in an unknown character encoding,
and attempting to determine the encoding so you can read the text. It's
like cracking a code when you don't have the decryption key.
Isn't that impossible?
----------------------
In general, yes. However, some encodings are optimized for specific
languages, and languages are not random. Some character sequences pop up
all the time, while other sequences make no sense. A person fluent in
English who opens a newspaper and finds “txzqJv 2!dasd0a QqdKjvz” will
instantly recognize that that isn't English (even though it is composed
entirely of English letters). By studying lots of “typical” text, a
computer algorithm can simulate this kind of fluency and make an
educated guess about a text's language.
In other words, encoding detection is really language detection,
combined with knowledge of which languages tend to use which character
encodings.
Who wrote this detection algorithm?
-----------------------------------
This library is a port of `the auto-detection code in
Mozilla <https://www-archive.mozilla.org/projects/intl/chardet.html>`__.
I have attempted to maintain as much of the original structure as
possible (mostly for selfish reasons, to make it easier to maintain the
port as the original code evolves). I have also retained the original
authors' comments, which are quite extensive and informative.
You may also be interested in the research paper which led to the
Mozilla implementation, `A composite approach to language/encoding
detection <http://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html>`__.
Yippie! Screw the standards, I'll just auto-detect everything!
--------------------------------------------------------------
Don't do that. Virtually every format and protocol contains a method for
specifying character encoding.
- HTTP can define a ``charset`` parameter in the ``Content-type``
header.
- HTML documents can define a ``<meta http-equiv="content-type">``
element in the ``<head>`` of a web page.
- XML documents can define an ``encoding`` attribute in the XML prolog.
If text comes with explicit character encoding information, you should
use it. If the text has no explicit information, but the relevant
standard defines a default encoding, you should use that. (This is
harder than it sounds, because standards can overlap. If you fetch an
XML document over HTTP, you need to support both standards *and* figure
out which one wins if they give you conflicting information.)
Despite the complexity, it's worthwhile to follow standards and `respect
explicit character encoding
information <http://www.w3.org/2001/tag/doc/mime-respect>`__. It will
almost certainly be faster and more accurate than trying to auto-detect
the encoding. It will also make the world a better place, since your
program will interoperate with other programs that follow the same
standards.
Why bother with auto-detection if it's slow, inaccurate, and non-standard?
--------------------------------------------------------------------------
Sometimes you receive text with verifiably inaccurate encoding
information. Or text without any encoding information, and the specified
default encoding doesn't work. There are also some poorly designed
standards that have no way to specify encoding at all.
If following the relevant standards gets you nowhere, *and* you decide
that processing the text is more important than maintaining
interoperability, then you can try to auto-detect the character encoding
as a last resort. An example is my `Universal Feed
Parser <https://pythonhosted.org/feedparser/>`__, which calls this auto-detection
library `only after exhausting all other
options <https://pythonhosted.org/feedparser/character-encoding.html>`__.

View File

@@ -0,0 +1,164 @@
How it works
============
This is a brief guide to navigating the code itself.
First, you should read `A composite approach to language/encoding
detection <https://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html>`__,
which explains the detection algorithm and how it was derived. This will
help you later when you stumble across the huge character frequency
distribution tables like ``big5freq.py`` and language models like
``langcyrillicmodel.py``.
The main entry point for the detection algorithm is
``universaldetector.py``, which has one class, ``UniversalDetector``.
(You might think the main entry point is the ``detect`` function in
``chardet/__init__.py``, but that's really just a convenience function
that creates a ``UniversalDetector`` object, calls it, and returns its
result.)
There are 5 categories of encodings that ``UniversalDetector`` handles:
#. ``UTF-n`` with a BOM. This includes ``UTF-8``, both BE and LE
variants of ``UTF-16``, and all 4 byte-order variants of ``UTF-32``.
#. Escaped encodings, which are entirely 7-bit ASCII compatible, where
non-ASCII characters start with an escape sequence. Examples:
``ISO-2022-JP`` (Japanese) and ``HZ-GB-2312`` (Chinese).
#. Multi-byte encodings, where each character is represented by a
variable number of bytes. Examples: ``Big5`` (Chinese), ``SHIFT_JIS``
(Japanese), ``EUC-KR`` (Korean), and ``UTF-8`` without a BOM.
#. Single-byte encodings, where each character is represented by one
byte. Examples: ``KOI8-R`` (Russian), ``windows-1255`` (Hebrew), and
``TIS-620`` (Thai).
#. ``windows-1252``, which is used primarily on Microsoft Windows; its
subset, ``ISO-8859-1`` is widely used for legacy 8-bit-encoded text.
chardet, like many encoding detectors, defaults to guessing this
encoding when no other can be reliably established.
``UTF-n`` with a BOM
--------------------
If the text starts with a BOM, we can reasonably assume that the text is
encoded in ``UTF-8``, ``UTF-16``, or ``UTF-32``. (The BOM will tell us
exactly which one; that's what it's for.) This is handled inline in
``UniversalDetector``, which returns the result immediately without any
further processing.
Escaped encodings
-----------------
If the text contains a recognizable escape sequence that might indicate
an escaped encoding, ``UniversalDetector`` creates an
``EscCharSetProber`` (defined in ``escprober.py``) and feeds it the
text.
``EscCharSetProber`` creates a series of state machines, based on models
of ``HZ-GB-2312``, ``ISO-2022-CN``, ``ISO-2022-JP``, and ``ISO-2022-KR``
(defined in ``escsm.py``). ``EscCharSetProber`` feeds the text to each
of these state machines, one byte at a time. If any state machine ends
up uniquely identifying the encoding, ``EscCharSetProber`` immediately
returns the positive result to ``UniversalDetector``, which returns it
to the caller. If any state machine hits an illegal sequence, it is
dropped and processing continues with the other state machines.
Multi-byte encodings
--------------------
Assuming no BOM, ``UniversalDetector`` checks whether the text contains
any high-bit characters. If so, it creates a series of “probers” for
detecting multi-byte encodings, single-byte encodings, and as a last
resort, ``windows-1252``.
The multi-byte encoding prober, ``MBCSGroupProber`` (defined in
``mbcsgroupprober.py``), is really just a shell that manages a group of
other probers, one for each multi-byte encoding: ``Big5``, ``GB2312``,
``EUC-TW``, ``EUC-KR``, ``EUC-JP``, ``SHIFT_JIS``, and ``UTF-8``.
``MBCSGroupProber`` feeds the text to each of these encoding-specific
probers and checks the results. If a prober reports that it has found an
illegal byte sequence, it is dropped from further processing (so that,
for instance, any subsequent calls to ``UniversalDetector``.\ ``feed``
will skip that prober). If a prober reports that it is reasonably
confident that it has detected the encoding, ``MBCSGroupProber`` reports
this positive result to ``UniversalDetector``, which reports the result
to the caller.
Most of the multi-byte encoding probers are inherited from
``MultiByteCharSetProber`` (defined in ``mbcharsetprober.py``), and
simply hook up the appropriate state machine and distribution analyzer
and let ``MultiByteCharSetProber`` do the rest of the work.
``MultiByteCharSetProber`` runs the text through the encoding-specific
state machine, one byte at a time, to look for byte sequences that would
indicate a conclusive positive or negative result. At the same time,
``MultiByteCharSetProber`` feeds the text to an encoding-specific
distribution analyzer.
The distribution analyzers (each defined in ``chardistribution.py``) use
language-specific models of which characters are used most frequently.
Once ``MultiByteCharSetProber`` has fed enough text to the distribution
analyzer, it calculates a confidence rating based on the number of
frequently-used characters, the total number of characters, and a
language-specific distribution ratio. If the confidence is high enough,
``MultiByteCharSetProber`` returns the result to ``MBCSGroupProber``,
which returns it to ``UniversalDetector``, which returns it to the
caller.
The case of Japanese is more difficult. Single-character distribution
analysis is not always sufficient to distinguish between ``EUC-JP`` and
``SHIFT_JIS``, so the ``SJISProber`` (defined in ``sjisprober.py``) also
uses 2-character distribution analysis. ``SJISContextAnalysis`` and
``EUCJPContextAnalysis`` (both defined in ``jpcntx.py`` and both
inheriting from a common ``JapaneseContextAnalysis`` class) check the
frequency of Hiragana syllabary characters within the text. Once enough
text has been processed, they return a confidence level to
``SJISProber``, which checks both analyzers and returns the higher
confidence level to ``MBCSGroupProber``.
Single-byte encodings
---------------------
The single-byte encoding prober, ``SBCSGroupProber`` (defined in
``sbcsgroupprober.py``), is also just a shell that manages a group of
other probers, one for each combination of single-byte encoding and
language: ``windows-1251``, ``KOI8-R``, ``ISO-8859-5``, ``MacCyrillic``,
``IBM855``, and ``IBM866`` (Russian); ``ISO-8859-7`` and
``windows-1253`` (Greek); ``ISO-8859-5`` and ``windows-1251``
(Bulgarian); ``ISO-8859-2`` and ``windows-1250`` (Hungarian);
``TIS-620`` (Thai); ``windows-1255`` and ``ISO-8859-8`` (Hebrew).
``SBCSGroupProber`` feeds the text to each of these
encoding+language-specific probers and checks the results. These probers
are all implemented as a single class, ``SingleByteCharSetProber``
(defined in ``sbcharsetprober.py``), which takes a language model as an
argument. The language model defines how frequently different
2-character sequences appear in typical text.
``SingleByteCharSetProber`` processes the text and tallies the most
frequently used 2-character sequences. Once enough text has been
processed, it calculates a confidence level based on the number of
frequently-used sequences, the total number of characters, and a
language-specific distribution ratio.
Hebrew is handled as a special case. If the text appears to be Hebrew
based on 2-character distribution analysis, ``HebrewProber`` (defined in
``hebrewprober.py``) tries to distinguish between Visual Hebrew (where
the source text actually stored “backwards” line-by-line, and then
displayed verbatim so it can be read from right to left) and Logical
Hebrew (where the source text is stored in reading order and then
rendered right-to-left by the client). Because certain characters are
encoded differently based on whether they appear in the middle of or at
the end of a word, we can make a reasonable guess about direction of the
source text, and return the appropriate encoding (``windows-1255`` for
Logical Hebrew, or ``ISO-8859-8`` for Visual Hebrew).
windows-1252
------------
If ``UniversalDetector`` detects a high-bit character in the text, but
none of the other multi-byte or single-byte encoding probers return a
confident result, it creates a ``Latin1Prober`` (defined in
``latin1prober.py``) to try to detect English text in a ``windows-1252``
encoding. This detection is inherently unreliable, because English
letters are encoded in the same way in many different encodings. The
only way to distinguish ``windows-1252`` is through commonly used
symbols like smart quotes, curly apostrophes, copyright symbols, and the
like. ``Latin1Prober`` automatically reduces its confidence rating to
allow more accurate probers to win if at all possible.

View File

@@ -0,0 +1,25 @@
chardet
=======
Character encoding auto-detection in Python. As smart as your browser.
Open source.
Documentation
=============
.. toctree::
:maxdepth: 2
faq
supported-encodings
usage
how-it-works
api/modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@@ -0,0 +1,242 @@
@ECHO OFF
REM Command file for Sphinx documentation
REM Usage: make.bat TARGET  -- mirrors the targets of the Unix Makefile.
REM Allow overriding the sphinx-build executable through the SPHINXBUILD
REM environment variable; fall back to the one found on PATH.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
REM All generated output lands beneath this directory.
set BUILDDIR=_build
REM Common options: a shared doctree cache plus any user-supplied SPHINXOPTS.
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
REM The i18n gettext builder must not share doctrees with the other builders.
set I18NSPHINXOPTS=%SPHINXOPTS% .
REM Optional paper size for the LaTeX builders: PAPER=a4 or PAPER=letter.
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
REM No target on the command line: fall through to the help screen.
if "%1" == "" goto help
REM Help screen. Lists every target this script implements; keep it in
REM sync with the dispatch blocks below. The latexpdf and latexpdfja
REM entries were previously missing even though both targets exist.
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo.  html       to make standalone HTML files
echo.  dirhtml    to make HTML files named index.html in directories
echo.  singlehtml to make a single large HTML file
echo.  pickle     to make pickle files
echo.  json       to make JSON files
echo.  htmlhelp   to make HTML files and a HTML help project
echo.  qthelp     to make HTML files and a qthelp project
echo.  devhelp    to make HTML files and a Devhelp project
echo.  epub       to make an epub
echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo.  latexpdf   to make LaTeX files and run them through pdflatex
echo.  latexpdfja to make LaTeX files and run them through platex/dvipdfmx
echo.  text       to make text files
echo.  man        to make manual pages
echo.  texinfo    to make Texinfo files
echo.  gettext    to make PO message catalogs
echo.  changes    to make an overview over all changed/added/deprecated items
echo.  xml        to make Docutils-native XML files
echo.  pseudoxml  to make pseudoxml-XML files for display purposes
echo.  linkcheck  to check all external links for integrity
echo.  doctest    to run all doctests embedded in the documentation if enabled
goto end
)
REM Remove all build output: first every subdirectory of BUILDDIR, then
REM any remaining files directly beneath it.
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
REM Probe that the sphinx-build executable is runnable before building.
REM cmd.exe sets errorlevel 9009 when the command could not be found.
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM One dispatch block per Sphinx builder from here on: run the builder,
REM abort on a non-zero exit code, then report where the output landed.
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
REM The trailing caret continues the echo text onto the next line.
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
REM Qt help builder. qcollectiongenerator compiles the generated .qhcp
REM project file into a .qhc collection file that Qt Assistant can open.
REM The instructions previously pointed at a nonexistent .ghc file; the
REM compiled collection has the .qhc extension.
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\chardet.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\chardet.qhc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
REM The two latexpdf targets generate LaTeX sources, then delegate the PDF
REM build to the make file Sphinx writes into the latex output directory.
REM NOTE: they do not check errorlevel after sphinx-build -- this matches
REM the upstream Sphinx make.bat template; confirm before tightening.
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
REM gettext uses I18NSPHINXOPTS: its doctrees must stay separate from the
REM cache the other builders share.
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
REM Common exit point for every dispatch block above.
:end

View File

@@ -0,0 +1,30 @@
Supported encodings
===================
Universal Encoding Detector currently supports over two dozen character
encodings.
- ``Big5``, ``GB2312``/``GB18030``, ``EUC-TW``, ``HZ-GB-2312``, and
``ISO-2022-CN`` (Traditional and Simplified Chinese)
- ``EUC-JP``, ``SHIFT_JIS``, and ``ISO-2022-JP`` (Japanese)
- ``EUC-KR`` and ``ISO-2022-KR`` (Korean)
- ``KOI8-R``, ``MacCyrillic``, ``IBM855``, ``IBM866``, ``ISO-8859-5``,
and ``windows-1251`` (Russian)
- ``ISO-8859-2`` and ``windows-1250`` (Hungarian)
- ``ISO-8859-5`` and ``windows-1251`` (Bulgarian)
- ``ISO-8859-1`` and ``windows-1252`` (Western European languages)
- ``ISO-8859-7`` and ``windows-1253`` (Greek)
- ``ISO-8859-8`` and ``windows-1255`` (Visual and Logical Hebrew)
- ``TIS-620`` (Thai)
- ``UTF-32`` BE, LE, 3412-ordered, or 2143-ordered (with a BOM)
- ``UTF-16`` BE or LE (with a BOM)
- ``UTF-8`` (with or without a BOM)
- ASCII
.. warning::
Due to inherent similarities between certain encodings, some encodings may
be detected incorrectly. In my tests, the most problematic case was
Hungarian text encoded as ``ISO-8859-2`` or ``windows-1250`` (encoded as
one but reported as the other). Also, Greek text encoded as ``ISO-8859-7``
was often mis-reported as ``ISO-8859-2``. Your mileage may vary.

View File

@@ -0,0 +1,88 @@
Usage
=====
Basic usage
-----------
The easiest way to use the Universal Encoding Detector library is with
the ``detect`` function.
Example: Using the ``detect`` function
--------------------------------------
The ``detect`` function takes one argument, a non-Unicode string. It
returns a dictionary containing the auto-detected character encoding and
a confidence level from ``0`` to ``1``.
.. code:: python
>>> import urllib.request
>>> rawdata = urllib.request.urlopen('http://yahoo.co.jp/').read()
>>> import chardet
>>> chardet.detect(rawdata)
{'encoding': 'EUC-JP', 'confidence': 0.99}
Advanced usage
--------------
If you're dealing with a large amount of text, you can call the
Universal Encoding Detector library incrementally, and it will stop as
soon as it is confident enough to report its results.
Create a ``UniversalDetector`` object, then call its ``feed`` method
repeatedly with each block of text. If the detector reaches a minimum
threshold of confidence, it will set ``detector.done`` to ``True``.
Once you've exhausted the source text, call ``detector.close()``, which
will do some final calculations in case the detector didn't hit its
minimum confidence threshold earlier. Then ``detector.result`` will be a
dictionary containing the auto-detected character encoding and
confidence level (the same as the ``chardet.detect`` function
`returns <usage.html#example-using-the-detect-function>`__).
Example: Detecting encoding incrementally
-----------------------------------------
.. code:: python
import urllib.request
from chardet.universaldetector import UniversalDetector
usock = urllib.request.urlopen('http://yahoo.co.jp/')
detector = UniversalDetector()
for line in usock.readlines():
detector.feed(line)
if detector.done: break
detector.close()
usock.close()
print(detector.result)
.. code:: python
{'encoding': 'EUC-JP', 'confidence': 0.99}
If you want to detect the encoding of multiple texts (such as separate
files), you can re-use a single ``UniversalDetector`` object. Just call
``detector.reset()`` at the start of each file, call ``detector.feed``
as many times as you like, and then call ``detector.close()`` and check
the ``detector.result`` dictionary for the file's results.
Example: Detecting encodings of multiple files
----------------------------------------------
.. code:: python
import glob
from chardet.universaldetector import UniversalDetector
detector = UniversalDetector()
for filename in glob.glob('*.xml'):
print(filename.ljust(60), end='')
detector.reset()
for line in open(filename, 'rb'):
detector.feed(line)
if detector.done: break
detector.close()
print(detector.result)