commit 0873d9e68879f7a8f814f656bf0e488b6d2b64b6
Author: Seiun <140777969+lonh-jing@users.noreply.github.com>
Date:   Wed Feb 11 10:26:47 2026 +0800

    Squashed 'plugins/tts_voice_plugin/' content from commit d14ba1bd

    git-subtree-dir: plugins/tts_voice_plugin
    git-subtree-split: d14ba1bdf00b09521a4eab8fd66ee83c64f2314c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f937ce2b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,40 @@
+# Sensitive configuration files
+config.toml
+config.toml.backup.*
+config.toml.reset.*
+
+# Python cache files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# Virtual environments
+venv/
+ENV/
+env/
+
+# IDE settings
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Temporary files
+*.log
+*.tmp
+.DS_Store
+
+# Generated audio files
+tts_*.mp3
+tts_*.wav
+tts_*.ogg
+
+# Data directory (holds temporary audio files)
+data/
+
+# Spec workflow directory
+.spec-workflow/
+
+# Claude settings
+.claude/

diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..0ad25db4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals.
This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..9cd5f91c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,311 @@
+# TTS Speech Synthesis Plugin
+
+A text-to-speech plugin for MaiBot with support for multiple TTS backends.
+
+## Supported backends
+
+| Backend | Description | Scope |
+|------|------|----------|
+| AI Voice | Built into MaiCore, no configuration needed | Group chat only |
+| GSV2P | Cloud API, requires a token | Group / private chat |
+| GPT-SoVITS | Local service, self-deployed | Group / private chat |
+| Doubao TTS | Volcengine cloud service, high quality | Group / private chat |
+| CosyVoice | Alibaba CosyVoice3, supports dialects and voice cloning | Group / private chat |
+| ComfyUI | Local ComfyUI workflow API (MLX Qwen3-TTS VoiceClone) | Group / private chat |
+
+## Installation
+
+```bash
+pip install aiohttp gradio_client
+```
+
+## Configuration
+
+Edit `config.toml` and set the default backend:
+
+```toml
+[general]
+default_backend = "cosyvoice"  # options: ai_voice / gsv2p / gpt_sovits / doubao / cosyvoice / comfyui
+audio_output_dir = ""          # audio output directory; leave empty to use the project root
+use_base64_audio = false       # send audio as base64 (fallback option)
+split_sentences = true         # send speech in segments (long text sentence by sentence)
+split_delay = 0.3              # delay between segments, in seconds
+send_error_messages = true     # send error messages to the chat (false = fail silently)
+```
+
+### Docker environment notes
+
+**Problem:** in a Docker environment, audio uploads may fail or file paths may not be resolved (e.g. `识别URL失败`, "failed to recognize URL").
+
+**Solutions (in recommended order):**
+
+#### Option 1: use a relative path (recommended)
+
+```toml
+[general]
+audio_output_dir = ""  # leave empty; defaults to the project root
+```
+
+Audio files are then saved in the project root, where OneBot/NapCat can resolve relative paths correctly.
+
+#### Option 2: custom output directory
+
+```toml
+[general]
+audio_output_dir = "data/tts_audio"   # relative to the project root
+# or
+audio_output_dir = "/app/data/audio"  # absolute path
+```
+
+#### Option 3: base64 encoding (fallback)
+
+If neither path option works, enable base64 sending:
+
+```toml
+[general]
+use_base64_audio = true  # send audio base64-encoded (adds roughly 33% to the payload size)
+```
+
+### Doubao TTS configuration
+
+```toml
+[doubao]
+app_id = "your APP_ID"
+access_key = "your ACCESS_KEY"
+resource_id = "seed-tts-2.0"
+default_voice = "zh_female_vv_uranus_bigtts"
+```
+
+**Preset voices:**
+
+| Voice name | voice_type |
+|----------|------------|
+| vivi 2.0 | zh_female_vv_uranus_bigtts |
+| 大壹 | zh_male_dayi_saturn_bigtts |
+| 黑猫侦探社咪仔 | zh_female_mizai_saturn_bigtts |
+
+**Cloned voices:** set `resource_id` to `seed-icl-2.0` and put the voice ID (e.g. `S_xxxxxx`) in `default_voice`.
+
+Credentials: [Volcengine console](https://console.volcengine.com/speech/service/8)
+
+### GSV2P configuration
+
+```toml
+[gsv2p]
+api_token = "your token"
+default_voice = "原神-中文-派蒙_ZH"
+```
+
+Get a token at [https://tts.acgnai.top](https://tts.acgnai.top)
+
+### AI Voice configuration
+
+```toml
+[ai_voice]
+default_character = "温柔妹妹"
+```
+
+22 preset voices are available, including 小新, 猴哥, 妲己, 酥心御姐, 温柔妹妹, and 邻家小妹.
+
+### GPT-SoVITS configuration
+
+**Two configuration formats are supported:**
+
+#### Format 1: array format (recommended, WebUI-friendly)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[[gpt_sovits.styles]]
+name = "default"
+refer_wav = "/path/to/reference.wav"
+prompt_text = "reference transcript"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt"    # optional: dynamic model switching
+sovits_weights = "/path/to/model.pth"  # optional: dynamic model switching
+
+[[gpt_sovits.styles]]
+name = "happy"
+refer_wav = "/path/to/happy.wav"
+prompt_text = "a happy reference transcript"
+prompt_language = "zh"
+```
+
+#### Format 2: dictionary format (legacy-compatible)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[gpt_sovits.styles.default]
+refer_wav = "/path/to/reference.wav"
+prompt_text = "reference transcript"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt"
+sovits_weights = "/path/to/model.pth"
+```
+
+> **Tip:** the plugin detects and accepts both formats automatically; the array format is recommended for better WebUI support.
+
+### CosyVoice configuration
+
+```toml
+[cosyvoice]
+gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
+default_mode = "3s极速复刻"  # or "自然语言控制" (natural-language control)
+default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"  # only used in 自然语言控制 mode; ignored in 3s极速复刻 mode
+reference_audio = "/path/to/ref.wav"  # reference audio path
+prompt_text = "transcript of the reference audio"
+timeout = 300  # API timeout in seconds
+```
+
+**Supported dialects / emotions / speeds:**
+
+| Type | Options |
+|------|----------|
+| Dialects | 广东话, 东北话, 四川话, 上海话, 闽南话, 山东话, 陕西话, 湖南话, and more (17 in total) |
+| Emotions | 开心 (happy), 伤心 (sad), 生气 (angry) |
+| Speed | 慢速 (slow), 快速 (fast) |
+| Volume | 大声 (loud), 小声 (quiet) |
+| Special styles | 小猪佩奇 (Peppa Pig), 机器人 (robot) |
+
+**Inference modes:**
+- `3s极速复刻` (3-second rapid cloning): requires a reference audio clip for voice cloning
+- `自然语言控制` (natural-language control): controls dialect, emotion, speed, etc. through instructions
+
+### ComfyUI configuration
+
+This backend runs a ComfyUI workflow through the HTTP API (`/prompt` -> `/history` -> `/view`) and uses `LoadAudio` to read the reference audio from ComfyUI's `input` directory.
+
+```toml
+[comfyui]
+server = "http://127.0.0.1:8188"
+input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
+timeout = 120
+audio_quality = "128k"  # SaveAudioMP3: V0/128k/320k
+mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
+mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
+default_style = "default"
+
+[[comfyui.styles]]
+name = "default"
+refer_wav = "/path/to/ref.wav"
+prompt_text = "verbatim transcript of the reference audio"
+language = "Auto"  # options: Auto/Chinese/English/Japanese...
+model_choice = "1.7B"
+precision = "bf16"
+seed = 0
+max_new_tokens = 2048
+top_p = 0.8
+top_k = 20
+temperature = 1.0
+repetition_penalty = 1.05
+```
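+
+For reference, here is a minimal sketch of the `/prompt` -> `/history` -> `/view` round trip this backend performs. It is illustrative only: the graph payload, node ids, and the exact shape of the history output depend on your workflow, and the polling loop is deliberately simplified (no client_id, websocket events, or timeout handling).
+
+```python
+import asyncio
+import aiohttp
+
+async def run_workflow(server: str, prompt_graph: dict) -> bytes:
+    """Queue an API-format prompt graph and fetch the first audio output."""
+    async with aiohttp.ClientSession() as session:
+        # 1. Queue the graph; ComfyUI answers with a prompt_id.
+        async with session.post(f"{server}/prompt", json={"prompt": prompt_graph}) as resp:
+            prompt_id = (await resp.json())["prompt_id"]
+
+        # 2. Poll /history until the prompt shows up with outputs.
+        while True:
+            async with session.get(f"{server}/history/{prompt_id}") as resp:
+                history = await resp.json()
+            if prompt_id in history and history[prompt_id].get("outputs"):
+                outputs = history[prompt_id]["outputs"]
+                break
+            await asyncio.sleep(1)
+
+        # 3. Download the saved file via /view. The "audio" key is what
+        #    audio save nodes report; adjust to match your workflow.
+        for node_output in outputs.values():
+            for item in node_output.get("audio", []):
+                params = {
+                    "filename": item["filename"],
+                    "subfolder": item.get("subfolder", ""),
+                    "type": item.get("type", "output"),
+                }
+                async with session.get(f"{server}/view", params=params) as resp:
+                    return await resp.read()
+    raise RuntimeError("workflow produced no audio output")
+```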
请用广东话表达。<|endofprompt|>" # 只有自然语言控制模式才会生效,3s极速复刻模式下不生效 +reference_audio = "/path/to/ref.wav" # 参考音频路径 +prompt_text = "参考音频对应的文本" # 参考音频的对应文本 +timeout = 300 # API超时(秒) +``` + +**支持的方言/情感/语速:** + +| 类型 | 可用选项 | +|------|----------| +| 方言 | 广东话、东北话、四川话、上海话、闽南话、山东话、陕西话、湖南话等17种 | +| 情感 | 开心、伤心、生气 | +| 语速 | 慢速、快速 | +| 音量 | 大声、小声 | +| 特殊风格 | 小猪佩奇、机器人 | + +**推理模式:** +- `3s极速复刻`:需要提供参考音频进行声音克隆 +- `自然语言控制`:通过指令控制方言、情感、语速等 + +## 使用方法 + +### 命令触发 + +``` +/tts 你好世界 # 使用默认后端 +/tts 今天天气不错 小新 # 指定音色 +/gsv2p 你好世界 # 使用 GSV2P +/doubao 你好世界 # 使用豆包 +/cosyvoice 你好世界 四川话 # 使用 CosyVoice,四川话 +/comfyui 你好世界 -v default # 使用 ComfyUI 本地工作流(MLX VoiceClone) +``` + +## ComfyUI 后端配置 + +该后端通过 ComfyUI 的 HTTP API 执行工作流(`/prompt` -> `/history` -> `/view`),并用 `LoadAudio` 从 ComfyUI 的 `input` 目录读取参考音频。 + +```toml +[comfyui] +server = "http://127.0.0.1:8188" +input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input" +timeout = 120 +audio_quality = "128k" # SaveAudioMP3: V0/128k/320k +mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python" +mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py" +default_style = "default" + +[[comfyui.styles]] +name = "default" +refer_wav = "/path/to/ref.wav" +prompt_text = "参考音频逐字稿" +language = "Auto" # 可选: Auto/Chinese/English/Japanese... +model_choice = "1.7B" +precision = "bf16" +seed = 0 +max_new_tokens = 2048 +top_p = 0.8 +top_k = 20 +temperature = 1.0 +repetition_penalty = 1.05 +``` + +### 自动触发 + +LLM 判断需要语音回复时会自动触发,可通过概率控制: + +```toml +[probability] +enabled = false # 默认关闭,每次都触发语音 +base_probability = 0.3 # 启用时 30% 概率触发 +``` + +### 智能分割插件支持 + +本插件已适配智能分割插件,支持使用 `|||SPLIT|||` 分隔符进行精确分段: + +- **优先级**:智能分割标记 > 自动句子分割 > 单句发送 +- **使用方式**:智能分割插件会在适当位置插入 `|||SPLIT|||` 标记,本插件自动识别并按标记分段发送 +- **示例**:`今天天气不错|||SPLIT|||适合出去玩|||SPLIT|||你觉得呢` 会分成三段语音依次发送 + +## 项目结构 + +``` +tts_voice_plugin/ +├── plugin.py # 插件入口 +├── config.toml # 配置文件 +├── backends/ # 后端实现 +│ ├── ai_voice.py +│ ├── gsv2p.py +│ ├── gpt_sovits.py +│ ├── doubao.py +│ └── cosyvoice.py +└── utils/ # 工具函数 +``` + +## 常见问题 + +**Q: Docker环境中提示"文件处理失败 识别URL失败"?** +A: 留空 `audio_output_dir` 配置项,插件将使用项目根目录保存音频(相对路径)。如仍有问题,可设置 `use_base64_audio = true` 使用base64编码发送。 + +**Q: AI Voice 提示"仅支持群聊"?** +A: AI Voice 只能在群聊使用,私聊会自动切换到其他后端。 + +**Q: 豆包语音怎么获取凭证?** +A: 登录火山引擎控制台,开通语音合成服务获取。 + +**Q: 文本太长被截断?** +A: 修改 `config.toml` 中 `max_text_length = 1000` + +**Q: 语音合成失败时不想让Bot发送错误消息?** +A: 设置 `send_error_messages = false`,语音合成失败时将静默处理,不向用户发送错误提示。 + +## 更新日志 + +### v3.2.3 +- 修复豆包语音 WAV 流式响应合并问题(正确处理 LIST/INFO 元数据块和多 header 情况) +- 默认后端改为 CosyVoice(更稳定的声音克隆体验) +- 默认关闭概率控制(每次触发都生成语音,更可预期的行为) +- 优化 LLM 长度约束提示(利用"近因效应"提高遵守率) +- 优化 action 记录格式,帮助 planner 避免重复执行 +- GSV2P/豆包音频格式默认改为 WAV(更好的兼容性) +- CosyVoice 默认模式改为 3s 极速复刻(更快响应) +- 更新默认超时配置(CosyVoice 300s, GSV2P 120s) + +### v3.2.2 +- 适配智能分割插件(支持 `|||SPLIT|||` 分隔符精确分段) +- GPT-SoVITS 支持数组格式配置(WebUI 友好,向后兼容字典格式) +- 修复豆包语音音色信息显示乱码问题 +- 优化配置文件注释,更简洁清晰 +- 优化分段发送逻辑优先级(智能分割 > 自动分割 > 单句) +- 禁用 Python 字节码生成(保持目录干净) +- 添加插件 ID 标识字段 + +### v3.2.1 +- 新增 `send_error_messages` 配置项(可选择关闭错误提示消息) +- 统一错误消息处理逻辑(通过 `_send_error` 方法) + +### v3.2.0 +- 新增 CosyVoice 后端(阿里云 ModelScope,支持 17 种方言、3 秒声音克隆) +- 新增分段发送功能(长文本自动分割逐句发送) +- GPT-SoVITS 支持动态模型切换(在风格配置中指定 gpt_weights/sovits_weights) +- GSV2P 后端新增重试机制(5 次重试,3 秒间隔) +- 新增 `/cosyvoice` 命令 +- 新增 gradio_client 依赖 + +### v3.1.0 +- 新增豆包语音后端(火山引擎云服务) +- 重构为模块化架构 +- HTTP Session 复用优化 + +## 信息 + +- 版本:3.2.3 +- 作者:靓仔 +- 许可:AGPL-v3.0 diff --git a/_manifest.json b/_manifest.json new file mode 100644 index 00000000..d640b6a3 --- 
diff --git a/_manifest.json b/_manifest.json
new file mode 100644
index 00000000..d640b6a3
--- /dev/null
+++ b/_manifest.json
@@ -0,0 +1,235 @@
+{
+  "manifest_version": 1,
+  "name": "Unified TTS Speech Synthesis Plugin",
+  "version": "3.2.3",
+  "description": "Unified TTS speech synthesis plugin integrating the AI Voice, GSV2P, GPT-SoVITS, Doubao TTS, CosyVoice, and ComfyUI backend engines for flexible speech synthesis.",
+  "author": {
+    "name": "靓仔",
+    "url": "https://github.com/xuqian13"
+  },
+  "license": "AGPL-v3.0",
+  "homepage_url": "",
+  "repository_url": "https://github.com/xuqian13/tts_voice_plugin",
+  "keywords": [
+    "TTS",
+    "speech synthesis",
+    "text-to-speech",
+    "AI voice",
+    "GSV2P",
+    "GPT-SoVITS",
+    "Doubao",
+    "CosyVoice",
+    "Volcengine",
+    "multi-backend",
+    "voice",
+    "read aloud",
+    "voice style",
+    "voice broadcast",
+    "dialect",
+    "voice cloning",
+    "MaiCore"
+  ],
+  "categories": [
+    "Voice",
+    "AI",
+    "Chat Enhancement",
+    "Entertainment",
+    "Utility",
+    "Communication",
+    "Accessibility"
+  ],
+  "host_application": {
+    "min_version": "0.12.0"
+  },
+  "default_locale": "zh-CN",
+  "plugin_info": {
+    "is_built_in": false,
+    "plugin_type": "general",
+    "components": [
+      {
+        "type": "action",
+        "name": "unified_tts_action",
+        "description": "Unified TTS action; switches intelligently among the configured backend engines, triggered autonomously by the LLM"
+      },
+      {
+        "type": "command",
+        "name": "unified_tts_command",
+        "description": "Unified TTS command; supports /tts, /voice, /gsv2p, /doubao and other command forms, with flexible backend and voice selection"
+      }
+    ],
+    "features": [
+      "Six TTS backends: AI Voice, GSV2P, GPT-SoVITS, Doubao TTS, CosyVoice, ComfyUI",
+      "AI Voice: built into MaiCore, simple and fast, 22+ preset voices",
+      "GSV2P: cloud API, high-quality synthesis, rich tuning parameters",
+      "GPT-SoVITS: local service, highly customizable, multi-style support",
+      "Doubao TTS: ByteDance cloud service, cloned voices and emotion control",
+      "CosyVoice: Alibaba speech synthesis, 17 dialects, 3-second voice cloning, emotion control",
+      "Modular architecture; backends are implemented independently and are easy to extend",
+      "HTTP session reuse for better performance",
+      "Automatic cleanup of temporary files to avoid concurrency conflicts",
+      "Smart trigger mode (LLM decides autonomously) and manual command mode",
+      "Probability control to keep voice replies from becoming too frequent",
+      "Smart language detection (Chinese/English/Japanese)",
+      "Automatic text cleanup and internet-slang normalization",
+      "Robust error handling and retry mechanisms",
+      "Flexible configuration system with independent per-backend settings"
+    ],
+    "dependencies": {
+      "python": [
+        "aiohttp",
+        "gradio_client"
+      ],
+      "system": [],
+      "plugins": []
+    },
+    "backend_info": {
+      "ai_voice": {
+        "provider": "MaiCore built-in",
+        "endpoint": "AI_VOICE_SEND command",
+        "authentication": "none",
+        "limitations": "group chats only",
+        "voices": "22+ preset voices (小新, 妲己, 酥心御姐, ...)"
+      },
+      "gsv2p": {
+        "provider": "GSV2P cloud service",
+        "endpoint": "https://gsv2p.acgnai.top/v1/audio/speech",
+        "authentication": "API token required",
+        "limitations": "API rate limits",
+        "features": "high-quality synthesis, multilingual, rich parameter tuning"
+      },
+      "gpt_sovits": {
+        "provider": "local GPT-SoVITS service",
+        "endpoint": "http://127.0.0.1:9880",
+        "authentication": "none",
+        "limitations": "requires a locally deployed service",
+        "features": "highly customizable, multi-style support, model weight switching"
+      },
+      "doubao": {
+        "provider": "ByteDance Volcengine",
+        "endpoint": "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
+        "authentication": "app_id, access_key, and resource_id required",
+        "limitations": "requires a Volcengine account",
+        "features": "fast and high quality, cloned voices, emotion and tone control"
+      },
+      "cosyvoice": {
+        "provider": "Alibaba CosyVoice",
+        "endpoint": "ModelScope Gradio API",
+        "authentication": "none (public Gradio endpoint)",
+        "limitations": "depends on ModelScope service availability",
+        "features": "3-second voice cloning, 17 dialects, emotion and speed control, natural-language instructions"
+      }
+    }
+  },
+  "configuration": {
+    "config_file": "config.toml",
+    "config_template": "config.toml.example",
+    "auto_generate": true,
+    "sections": [
+      {
+        "name": "plugin",
+        "description": "Basic plugin settings"
+      },
+      {
+        "name": "general",
+        "description": "General settings (default backend, timeouts, text length, etc.)"
+      },
+      {
+        "name": "components",
+        "description": "Component enable/disable switches"
+      },
+      {
+        "name": "probability",
+        "description": "Probability control (keeps voice replies from becoming too frequent)"
+      },
+      {
+        "name": "ai_voice",
+        "description": "AI Voice backend settings (voice alias map, etc.)"
+      },
+      {
+        "name": "gsv2p",
+        "description": "GSV2P backend settings (API address, token, parameters, etc.)"
+      },
+      {
+        "name": "gpt_sovits",
+        "description": "GPT-SoVITS backend settings (server address, styles, etc.)"
+      },
+      {
+        "name": "doubao",
+        "description": "Doubao TTS backend settings (Volcengine credentials, voices, emotions, etc.)"
+      },
+      {
+        "name": "cosyvoice",
+        "description": "CosyVoice backend settings (Gradio URL, mode, dialects, etc.)"
+      }
+    ]
+  },
+  "usage_examples": [
+    {
"type": "action", + "backend": "auto", + "description": "LLM自动触发语音回复", + "example": "用户:请用语音说\"你好世界\"\n机器人:[使用默认后端自动生成语音文件并发送]" + }, + { + "type": "command", + "backend": "ai_voice", + "description": "手动命令使用AI Voice", + "example": "/tts 你好世界 小新" + }, + { + "type": "command", + "backend": "gsv2p", + "description": "手动命令使用GSV2P", + "example": "/gsv2p 今天天气不错" + }, + { + "type": "command", + "backend": "doubao", + "description": "手动命令使用豆包语音", + "example": "/doubao 你好世界" + }, + { + "type": "command", + "backend": "gpt_sovits", + "description": "手动命令使用GPT-SoVITS", + "example": "/tts 测试一下 default gpt_sovits" + }, + { + "type": "command", + "backend": "cosyvoice", + "description": "手动命令使用CosyVoice", + "example": "/cosyvoice 你好世界 四川话" + }, + { + "type": "command", + "backend": "auto", + "description": "使用默认后端", + "example": "/tts 你好世界" + } + ], + "migration_info": { + "from_plugins": [ + "ai_voice_plugin (v1.0.0)", + "gsv2p_tts_plugin (v1.0.0)", + "tts_voice_plugin (v2.0.0)", + "tts_voice_plugin (v3.0.0)" + ], + "migration_notes": [ + "本插件整合了ai_voice_plugin、gsv2p_tts_plugin和旧版tts_voice_plugin的所有功能", + "v3.2.2适配智能分割插件(支持|||SPLIT|||分隔符精确分段)", + "v3.2.2支持GPT-SoVITS数组格式配置(WebUI友好,向后兼容字典格式)", + "v3.2.2修复豆包语音音色信息显示乱码问题", + "v3.2.2优化配置文件注释,更简洁清晰", + "v3.2.0新增CosyVoice后端支持(阿里云语音合成,支持17种方言和3秒声音克隆)", + "v3.1.0新增豆包语音后端支持", + "v3.1.0重构为模块化架构,提升代码可维护性", + "配置文件需要重新生成,原配置需手动迁移", + "建议备份旧插件配置后再迁移", + "AI Voice音色映射保持兼容", + "GSV2P API配置需重新填写Token", + "GPT-SoVITS风格配置需要重新设置", + "新增config.toml.example模板文件" + ] + }, + "id": "tts_voice_plugin" +} \ No newline at end of file diff --git a/backends/__init__.py b/backends/__init__.py new file mode 100644 index 00000000..ddcafef1 --- /dev/null +++ b/backends/__init__.py @@ -0,0 +1,38 @@ +""" +TTS后端模块 +""" + +import sys +sys.dont_write_bytecode = True + +from .base import TTSBackendBase, TTSBackendRegistry, TTSResult +from .ai_voice import AIVoiceBackend +from .gsv2p import GSV2PBackend +from .gpt_sovits import GPTSoVITSBackend +from .doubao import DoubaoBackend +from .cosyvoice import CosyVoiceBackend +from .comfyui import ComfyUIBackend, ComfyUIVoiceCloneBackend, ComfyUICustomVoiceBackend + +# 注册后端 +TTSBackendRegistry.register("ai_voice", AIVoiceBackend) +TTSBackendRegistry.register("gsv2p", GSV2PBackend) +TTSBackendRegistry.register("gpt_sovits", GPTSoVITSBackend) +TTSBackendRegistry.register("doubao", DoubaoBackend) +TTSBackendRegistry.register("cosyvoice", CosyVoiceBackend) +TTSBackendRegistry.register("comfyui", ComfyUIBackend) +TTSBackendRegistry.register("comfyui_voiceclone", ComfyUIVoiceCloneBackend) +TTSBackendRegistry.register("comfyui_customvoice", ComfyUICustomVoiceBackend) + +__all__ = [ + "TTSBackendBase", + "TTSBackendRegistry", + "TTSResult", + "AIVoiceBackend", + "GSV2PBackend", + "GPTSoVITSBackend", + "DoubaoBackend", + "CosyVoiceBackend", + "ComfyUIBackend", + "ComfyUIVoiceCloneBackend", + "ComfyUICustomVoiceBackend", +] diff --git a/backends/ai_voice.py b/backends/ai_voice.py new file mode 100644 index 00000000..c916fa00 --- /dev/null +++ b/backends/ai_voice.py @@ -0,0 +1,133 @@ +""" +AI Voice 后端实现 +使用 MaiCore 内置的 AI 语音功能 +""" + +from typing import Optional, Callable, Dict +from .base import TTSBackendBase, TTSResult +from ..utils.text import TTSTextUtils +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_ai_voice") + +# AI Voice 音色映射表 +AI_VOICE_ALIAS_MAP = { + "小新": "lucy-voice-laibixiaoxin", + "猴哥": "lucy-voice-houge", + "四郎": "lucy-voice-silang", + "东北老妹儿": "lucy-voice-guangdong-f1", + "广西大表哥": 
"lucy-voice-guangxi-m1", + "妲己": "lucy-voice-daji", + "霸道总裁": "lucy-voice-lizeyan", + "酥心御姐": "lucy-voice-suxinjiejie", + "说书先生": "lucy-voice-m8", + "憨憨小弟": "lucy-voice-male1", + "憨厚老哥": "lucy-voice-male3", + "吕布": "lucy-voice-lvbu", + "元气少女": "lucy-voice-xueling", + "文艺少女": "lucy-voice-f37", + "磁性大叔": "lucy-voice-male2", + "邻家小妹": "lucy-voice-female1", + "低沉男声": "lucy-voice-m14", + "傲娇少女": "lucy-voice-f38", + "爹系男友": "lucy-voice-m101", + "暖心姐姐": "lucy-voice-female2", + "温柔妹妹": "lucy-voice-f36", + "书香少女": "lucy-voice-f34" +} + + +class AIVoiceBackend(TTSBackendBase): + """ + AI Voice 后端 + + 使用 MaiCore 内置的 AI 语音功能 + 注意:仅支持群聊环境 + """ + + backend_name = "ai_voice" + backend_description = "MaiCore内置AI语音(仅群聊)" + support_private_chat = False # 不支持私聊 + default_audio_format = "" # AI Voice不需要音频格式 + + def __init__(self, config_getter, log_prefix: str = ""): + super().__init__(config_getter, log_prefix) + self._send_command = None # 由外部注入 + + def set_send_command(self, send_command_func: Callable) -> None: + """设置发送命令的函数(由Action/Command注入)""" + self._send_command = send_command_func + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.AI_VOICE_DEFAULT_CHARACTER, "温柔妹妹") + + def resolve_voice(self, voice: Optional[str]) -> str: + """解析音色别名""" + alias_map: Dict[str, str] = self.get_config( + ConfigKeys.AI_VOICE_ALIAS_MAP, + AI_VOICE_ALIAS_MAP + ) + default_voice = self.get_default_voice() + return TTSTextUtils.resolve_voice_alias( + voice, + alias_map, + default_voice, + prefix="lucy-voice-" + ) + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行AI Voice语音合成 + + Args: + text: 待转换的文本 + voice: 音色名称或别名 + + Returns: + TTSResult + """ + if not self._send_command: + return TTSResult( + success=False, + message="AI Voice后端未正确初始化(缺少send_command)", + backend_name=self.backend_name + ) + + # 解析音色 + character = self.resolve_voice(voice) + + try: + success = await self._send_command( + command_name="AI_VOICE_SEND", + args={"text": text, "character": character}, + storage_message=False + ) + + if success: + logger.info(f"{self.log_prefix} AI语音发送成功 (音色: {character})") + return TTSResult( + success=True, + message=f"成功发送AI语音 (音色: {character})", + backend_name=self.backend_name + ) + else: + return TTSResult( + success=False, + message="AI语音命令发送失败", + backend_name=self.backend_name + ) + + except Exception as e: + logger.error(f"{self.log_prefix} AI语音执行错误: {e}") + return TTSResult( + success=False, + message=f"AI语音执行错误: {e}", + backend_name=self.backend_name + ) diff --git a/backends/base.py b/backends/base.py new file mode 100644 index 00000000..9d8936f4 --- /dev/null +++ b/backends/base.py @@ -0,0 +1,239 @@ +""" +TTS后端抽象基类和注册表 +""" + +import asyncio +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, Type, Optional, Any, Callable, Tuple, Union +from src.common.logger import get_logger +from ..config_keys import ConfigKeys + +logger = get_logger("tts_backend") + + +@dataclass +class TTSResult: + """TTS执行结果""" + success: bool + message: str + audio_path: Optional[str] = None + backend_name: str = "" + + def __iter__(self): + """支持解包为 (success, message)""" + return iter((self.success, self.message)) + + +class TTSBackendBase(ABC): + """ + TTS后端抽象基类 + + 所有TTS后端必须继承此类并实现 execute 方法 + """ + + # 后端名称(子类必须覆盖) + backend_name: str = "base" + + # 后端描述 + backend_description: str = "TTS后端基类" + + # 是否支持私聊 + support_private_chat: bool = True + + # 默认音频格式 + default_audio_format: str = 
"mp3" + + def __init__(self, config_getter: Callable[[str, Any], Any], log_prefix: str = ""): + """ + 初始化后端 + + Args: + config_getter: 配置获取函数,签名为 get_config(key, default) + log_prefix: 日志前缀 + """ + self.get_config = config_getter + self.log_prefix = log_prefix or f"[{self.backend_name}]" + self._send_custom = None + + def set_send_custom(self, send_custom_func: Callable) -> None: + """设置发送自定义消息的函数""" + self._send_custom = send_custom_func + + async def send_audio( + self, + audio_data: bytes, + audio_format: str = "mp3", + prefix: str = "tts", + voice_info: str = "" + ) -> TTSResult: + """ + 统一的音频发送方法 + + Args: + audio_data: 音频二进制数据 + audio_format: 音频格式(如mp3、wav) + prefix: 文件名前缀 + voice_info: 音色信息(用于日志) + + Returns: + TTSResult + """ + from ..utils.file import TTSFileManager + + # 检查是否使用base64发送 + use_base64 = self.get_config(ConfigKeys.GENERAL_USE_BASE64_AUDIO, False) + logger.debug(f"{self.log_prefix} 开始发送音频 (原始大小: {len(audio_data)}字节, 格式: {audio_format})") + + if use_base64: + # 使用base64编码发送 + base64_audio = TTSFileManager.audio_to_base64(audio_data) + if not base64_audio: + return TTSResult(False, "音频数据转base64失败", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} base64编码完成,准备通过send_custom发送") + if self._send_custom: + await self._send_custom(message_type="voice", content=base64_audio) + logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (base64模式, 音频大小: {len(audio_data)}字节)") + else: + logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音") + return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name) + + return TTSResult( + success=True, + message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}, base64模式", + backend_name=self.backend_name + ) + else: + # 使用文件路径发送 + output_dir = self.get_config(ConfigKeys.GENERAL_AUDIO_OUTPUT_DIR, "") + audio_path = TTSFileManager.generate_temp_path( + prefix=prefix, + suffix=f".{audio_format}", + output_dir=output_dir + ) + + if not await TTSFileManager.write_audio_async(audio_path, audio_data): + return TTSResult(False, "保存音频文件失败", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} 音频文件已保存, 路径: {audio_path}") + # 发送语音 + if self._send_custom: + await self._send_custom(message_type="voiceurl", content=audio_path) + logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (文件路径模式, 路径: {audio_path})") + # 延迟清理临时文件 + asyncio.create_task(TTSFileManager.cleanup_file_async(audio_path, delay=30)) + else: + logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音") + return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name) + + return TTSResult( + success=True, + message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}", + audio_path=audio_path, + backend_name=self.backend_name + ) + + @abstractmethod + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行TTS转换 + + Args: + text: 待转换的文本 + voice: 音色/风格 + **kwargs: 其他参数(如emotion等) + + Returns: + TTSResult 包含执行结果 + """ + raise NotImplementedError + + def validate_config(self) -> Tuple[bool, str]: + """ + 验证后端配置是否完整 + + Returns: + (is_valid, error_message) + """ + return True, "" + + def get_default_voice(self) -> str: + """获取默认音色""" + return "" + + def is_available(self) -> bool: + """检查后端是否可用""" + is_valid, _ = self.validate_config() + return is_valid + + +class TTSBackendRegistry: + """ + TTS后端注册表 + + 使用策略模式 + 工厂模式管理后端 + """ + + _backends: Dict[str, Type[TTSBackendBase]] = {} + + @classmethod + def register(cls, name: str, backend_class: 
+
+
+class TTSBackendRegistry:
+    """
+    TTS backend registry.
+
+    Manages backends via the strategy + factory patterns.
+    """
+
+    _backends: Dict[str, Type[TTSBackendBase]] = {}
+
+    @classmethod
+    def register(cls, name: str, backend_class: Type[TTSBackendBase]) -> None:
+        """
+        Register a backend.
+
+        Args:
+            name: backend name
+            backend_class: backend class
+        """
+        cls._backends[name] = backend_class
+        logger.debug(f"注册TTS后端: {name}")
+
+    @classmethod
+    def unregister(cls, name: str) -> None:
+        """Unregister a backend."""
+        if name in cls._backends:
+            del cls._backends[name]
+
+    @classmethod
+    def get(cls, name: str) -> Optional[Type[TTSBackendBase]]:
+        """Return the backend class."""
+        return cls._backends.get(name)
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        config_getter: Callable[[str, Any], Any],
+        log_prefix: str = ""
+    ) -> Optional[TTSBackendBase]:
+        """
+        Create a backend instance.
+
+        Args:
+            name: backend name
+            config_getter: config accessor
+            log_prefix: log prefix
+
+        Returns:
+            a backend instance, or None
+        """
+        backend_class = cls.get(name)
+        if backend_class:
+            return backend_class(config_getter, log_prefix)
+        return None
+
+    @classmethod
+    def list_backends(cls) -> list[str]:
+        """List the names of all registered backends."""
+        return list(cls._backends.keys())
+
+    @classmethod
+    def is_registered(cls, name: str) -> bool:
+        """Check whether a backend is registered."""
+        return name in cls._backends

diff --git a/backends/comfyui.py b/backends/comfyui.py
new file mode 100644
index 00000000..d574e9fe
--- /dev/null
+++ b/backends/comfyui.py
@@ -0,0 +1,827 @@
+"""
+ComfyUI backend (Workflow API).
+
+This backend calls a fixed ComfyUI prompt graph:
+LoadAudio -> MLX_Qwen3TTSVoiceClone -> SaveAudioMP3
+
+Rationale:
+- ComfyUI expects API-format "prompt" graphs (not UI workflow json).
+- For audio inputs, the simplest reliable path is to copy the reference audio into ComfyUI/input
+  and use the built-in LoadAudio node.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import os
+import re
+import time
+import uuid
+from typing import Any, ClassVar, Dict, Optional, Tuple
+from urllib.parse import urlencode
+
+from src.common.logger import get_logger
+from src.plugin_system.apis import generator_api
+
+from .base import TTSBackendBase, TTSResult
+from ..config_keys import ConfigKeys
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..utils.text import TTSTextUtils
+
+logger = get_logger("tts_comfyui")
+
+
+LANG_TO_DEMO = {
+    "zh": "Chinese",
+    "ja": "Japanese",
+    "en": "English",
+}
+
+
+class ComfyUIBackend(TTSBackendBase):
+    backend_name = "comfyui"
+    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone/CustomVoice)"
+    support_private_chat = True
+    default_audio_format = "mp3"
+
+    _ref_cache: ClassVar[Dict[str, str]] = {}
+    _instruct_cache: ClassVar[Dict[str, str]] = {}
+    # If set by subclasses, only these modes are allowed (e.g. {"voice_clone"}).
+    allowed_modes: ClassVar[Optional[set[str]]] = None
+
+    def get_default_voice(self) -> str:
+        return self.get_config(ConfigKeys.COMFYUI_DEFAULT_STYLE, "default")
+
+    def _filter_styles_by_mode(self, styles: Dict[str, Any]) -> Dict[str, Any]:
+        allowed = self.allowed_modes
+        if not allowed:
+            return styles
+        out: Dict[str, Any] = {}
+        for name, st in (styles or {}).items():
+            if not isinstance(st, dict):
+                continue
+            mode = str(st.get("mode") or "voice_clone").strip()
+            if mode in allowed:
+                out[name] = st
+        return out
+
+    def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
+        # Match GPT-SoVITS backend style schema: list[{name,...}] or dict{name:{...}}
+        if isinstance(styles_config, dict):
+            return styles_config
+        if isinstance(styles_config, list):
+            result = {}
+            for style in styles_config:
+                if isinstance(style, dict) and "name" in style:
+                    name = style["name"]
+                    result[name] = {k: v for k, v in style.items() if k != "name"}
+            return result
+        return {}
+
+    def _clean_instruct(self, s: str, max_chars: int) -> str:
+        s = (s or "").strip()
+        if not s:
+            return ""
+
+        # Strip common wrappers.
+        s = s.replace("```", "").strip()
+        s = re.sub(r"^instruct\s*[::]\s*", "", s, flags=re.IGNORECASE).strip()
+
+        # Prefer first non-empty line.
+        for line in s.splitlines():
+            line = line.strip()
+            if line:
+                s = line
+                break
+
+        # Trim quotes.
+        if len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")):
+            s = s[1:-1].strip()
+
+        if max_chars and len(s) > max_chars:
+            s = s[:max_chars].rstrip()
+        return s
+
+    def _clean_base_tone(self, s: str) -> str:
+        """
+        Clean a base tone/persona string so it can safely live inside `基调=...`:
+        - single-line
+        - no semicolons (they are field separators)
+        - no '=' (KV separator)
+        """
+        s = (s or "").strip()
+        if not s:
+            return ""
+        s = s.replace("\r", " ").replace("\n", " ")
+        s = re.sub(r"\s+", " ", s).strip()
+        # Avoid breaking KV parsing.
+        s = s.replace(";", ",").replace(";", ",")
+        s = s.replace("=", " ").replace("=", " ")
+        return s.strip(" ,,")
+
+    def _attach_base_tone(self, instruct: str, max_chars: int) -> str:
+        """
+        If configured, prefix inferred instruct with a fixed base tone/persona:
+        `基调=<...>;情绪=...;语速=...;停顿=...`
+
+        Priority when trimming: keep the inferred instruct fields intact if possible.
+        """
+        base_raw = self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or ""
+        base = self._clean_base_tone(str(base_raw))
+        if not base:
+            return (instruct or "").strip()
+
+        s = (instruct or "").strip()
+        fields = self._parse_instruct_fields(s)
+        if "基调" in fields:
+            return s
+
+        prefix = f"基调={base}"
+        if not s:
+            return prefix[:max_chars].rstrip() if max_chars else prefix
+
+        combined = f"{prefix};{s}"
+        if not max_chars or len(combined) <= max_chars:
+            return combined
+
+        # Too long: try trimming base first, keeping inferred instruct intact.
+        remain = max_chars - len(s) - len(";") - len("基调=")
+        if remain <= 0:
+            # Can't fit base at all; keep instruct (already max_chars-limited upstream).
+            return s[:max_chars].rstrip()
+        base_trim = base[:remain].rstrip(" ,,")
+        return f"基调={base_trim};{s}"
+
+    def _parse_instruct_fields(self, instruct: str) -> Dict[str, str]:
+        """
+        Parse a 1-line instruct like:
+            情绪=愤怒;语速=很快;停顿=很少;表现=咬牙切齿
+
+        We only *use* a few keys (情绪/语速/停顿/强度/表现...), but keep it generic.
+        """
+        s = (instruct or "").strip()
+        if not s:
+            return {}
+
+        # Normalize separators (full-width punctuation).
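+        # Illustrative example: "情绪=愤怒;语速=很快;停顿=很少" normalizes to
+        # "情绪=愤怒;语速=很快;停顿=很少" and then parses to
+        # {"情绪": "愤怒", "语速": "很快", "停顿": "很少"}.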
+ s = s.replace(";", ";").replace(":", ":").replace("=", "=") + + # Split by semicolon/comma-like separators. + parts = [p.strip() for p in re.split(r"[;]+", s) if p.strip()] + out: Dict[str, str] = {} + for p in parts: + if "=" not in p: + continue + k, v = p.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + # Limit key length to avoid garbage. + if len(k) > 8: + continue + out[k] = v + return out + + def _map_speed_label(self, label: str) -> Optional[float]: + lab = (label or "").strip() + m = { + "很慢": 0.85, + "稍慢": 0.93, + "正常": 1.00, + "稍快": 1.07, + "很快": 1.15, + } + return m.get(lab) + + def _map_pause_label(self, label: str) -> Optional[float]: + lab = (label or "").strip() + m = { + "很少": 0.6, + "自然": 1.0, + "稍多": 1.3, + "很多": 1.7, + } + return m.get(lab) + + def _ensure_base_pause_cfg(self, pause_cfg: Dict[str, float]) -> Dict[str, float]: + # If caller didn't configure pauses (all zeros), apply a conservative base so "停顿" can take effect. + keys = ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"] + if all(float(pause_cfg.get(k, 0.0) or 0.0) == 0.0 for k in keys): + return { + **pause_cfg, + "pause_linebreak": 0.18, + "period_pause": 0.22, + "comma_pause": 0.10, + "question_pause": 0.20, + "hyphen_pause": 0.06, + } + return pause_cfg + + def _enrich_instruct_for_emotion(self, instruct: str, max_chars: int) -> str: + """ + Add short performance cues for common emotions, keeping it single-line KV style. + This helps when the model under-reacts to simple labels like "愤怒". + """ + s = (instruct or "").strip() + if not s: + return "" + + fields = self._parse_instruct_fields(s) + emo = fields.get("情绪", "") + if not emo: + return s + + # Only add if it doesn't already contain a "表现=" field. + if "表现" in fields: + return s + + emo_norm = emo + cues = "" + if "愤怒" in emo_norm or "生气" in emo_norm: + cues = "声压高,咬字重,重音强,尾音下压" + elif "开心" in emo_norm or "高兴" in emo_norm: + cues = "笑意明显,轻快上扬,尾音明亮" + elif "悲伤" in emo_norm or "难过" in emo_norm: + cues = "气声略多,音量偏低,语尾下沉" + elif "温柔" in emo_norm: + cues = "音量轻,气声柔,语尾轻收" + elif "冷淡" in emo_norm or "冷静" in emo_norm: + cues = "平直克制,少起伏,干净收尾" + + if not cues: + return s + + extra = f";表现={cues}" + if max_chars and len(s) + len(extra) > max_chars: + # Trim cues to fit. + allow = max_chars - len(s) - len(";表现=") + if allow <= 0: + return s[:max_chars].rstrip() + cues = cues[:allow].rstrip(",, ") + extra = f";表现={cues}" + return (s + extra)[:max_chars].rstrip() if max_chars else (s + extra) + + def _apply_instruct_controls( + self, instruct: str, speed: float, pause_cfg: Dict[str, float], max_chars: int + ) -> Tuple[str, float, Dict[str, float]]: + """ + If instruct contains '语速'/'停顿', map them to real synthesis controls. + This makes auto_instruct meaningfully affect output even if the model is insensitive to labels. 
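+
+        Illustrative example: "情绪=愤怒;语速=很快;停顿=很少" maps speed to 1.15
+        via _map_speed_label and scales every pause value by 0.6 via
+        _map_pause_label (after _ensure_base_pause_cfg supplies base pauses
+        when none are configured).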
+ """ + s = (instruct or "").strip() + if not s: + return "", speed, pause_cfg + + fields = self._parse_instruct_fields(s) + speed_label = fields.get("语速", "") + pause_label = fields.get("停顿", "") + + out_speed = float(speed) + mapped_speed = self._map_speed_label(speed_label) + if mapped_speed is not None: + out_speed = mapped_speed + + out_pause_cfg = dict(pause_cfg or {}) + mapped_pause = self._map_pause_label(pause_label) + if mapped_pause is not None: + out_pause_cfg = self._ensure_base_pause_cfg(out_pause_cfg) + for k in ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]: + try: + out_pause_cfg[k] = float(out_pause_cfg.get(k, 0.0) or 0.0) * float(mapped_pause) + except Exception: + pass + + # Add short performance cues (kept within max_chars). + s = self._enrich_instruct_for_emotion(s, max_chars=max_chars) + return s, out_speed, out_pause_cfg + + async def _infer_instruct( + self, + text: str, + detected_lang: str, + chat_stream=None, + chat_id: Optional[str] = None, + style_name: str = "", + ) -> str: + """ + Infer a short CustomVoice `instruct` string from the target text via MaiBot's LLM interface. + """ + enabled = bool(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_ENABLED, False)) + if not enabled: + return "" + + max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40) + + # Default prompt: output ONE short instruct line only. + default_tpl = ( + "你是配音导演。请根据要朗读的文本生成一行 TTS instruct。\\n" + "硬性要求:必须同时包含【情绪】【语速】【停顿】三项。可以额外补充 1-2 个表演提示(如 音量/重音/音高/表现)。\\n" + "只输出一行,不要解释,不要复述原文,不要引号/代码块。\\n" + "输出格式固定为:情绪=<...>;语速=<...>;停顿=<...>\\n" + "语速可选:很慢/稍慢/正常/稍快/很快。\\n" + "停顿可选:很少/自然/稍多/很多。\\n" + "长度<= {max_chars} 字。\\n" + "文本语言: {lang}\\n" + "待朗读文本: {text}\\n" + ) + prompt_tpl = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_PROMPT, default_tpl) or "") + if not prompt_tpl.strip(): + prompt_tpl = default_tpl + + # Cache key should change if prompt/base_tone/max_chars changes. 
+        base_raw = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or "")
+        cfg_sig_src = f"{max_chars}\n{prompt_tpl}\n{base_raw}"
+        cfg_sig = hashlib.sha256(cfg_sig_src.encode("utf-8")).hexdigest()[:12]
+        text_sig = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
+        cache_key = f"{cfg_sig}:{detected_lang}:{text_sig}"
+        cached = self._instruct_cache.get(cache_key)
+        if cached:
+            return cached
+
+        lang = detected_lang or "auto"
+        prompt = prompt_tpl.format(text=text.strip(), lang=lang, max_chars=max_chars)
+
+        try:
+            resp = await generator_api.generate_tts_instruct(
+                prompt=prompt,
+                request_type="tts_instruct",
+            )
+            instruct = self._clean_instruct(resp or "", max_chars=max_chars)
+            instruct = self._attach_base_tone(instruct, max_chars=max_chars)
+            if instruct:
+                self._instruct_cache[cache_key] = instruct
+            return instruct
+        except Exception as e:
+            logger.warning(f"{self.log_prefix} auto_instruct 失败(style={style_name}): {e}")
+            return ""
+
+    def validate_config(self) -> Tuple[bool, str]:
+        server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
+        if not server:
+            return False, "ComfyUI 未配置 server"
+
+        input_dir = self.get_config(
+            ConfigKeys.COMFYUI_INPUT_DIR,
+            "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
+        )
+        if not input_dir:
+            return False, "ComfyUI 未配置 input_dir"
+
+        styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
+        styles = self._normalize_styles_config(styles_raw)
+        if not styles:
+            return False, "ComfyUI 后端未配置任何风格(至少需要配置 1 个 style)"
+
+        default_name = self.get_default_voice() or "default"
+        if default_name not in styles:
+            # Fallback to "default" if present.
+            if "default" in styles:
+                default_name = "default"
+            else:
+                return False, f"ComfyUI default_style='{default_name}' 不存在"
+
+        st = styles.get(default_name, {})
+        mode = (st.get("mode") or "voice_clone").strip()
+        if mode == "voice_clone":
+            if not st.get("refer_wav") or not st.get("prompt_text"):
+                return False, f"ComfyUI 风格 '{default_name}' 配置不完整(voice_clone 需要 refer_wav 和 prompt_text)"
+        elif mode == "custom_voice":
+            if not st.get("model_path") or not st.get("speaker"):
+                return False, f"ComfyUI 风格 '{default_name}' 配置不完整(custom_voice 需要 model_path 和 speaker)"
+        else:
+            return False, f"ComfyUI 风格 '{default_name}' mode 无效: {mode}"
+
+        return True, ""
+
+    def _ensure_ref_in_input(self, input_dir: str, refer_wav: str) -> str:
+        refer_wav = TTSFileManager.resolve_path(refer_wav)
+        if not os.path.exists(refer_wav):
+            raise FileNotFoundError(f"参考音频不存在: {refer_wav}")
+
+        st = os.stat(refer_wav)
+        cache_key = f"{os.path.abspath(refer_wav)}:{st.st_mtime_ns}:{st.st_size}"
+        if cache_key in self._ref_cache:
+            name = self._ref_cache[cache_key]
+            if os.path.exists(os.path.join(input_dir, name)):
+                return name
+
+        ext = os.path.splitext(refer_wav)[1] or ".wav"
+        h = hashlib.sha256(cache_key.encode("utf-8")).hexdigest()[:16]
+        name = f"maibot_ref_{h}{ext}"
+        dst = os.path.join(input_dir, name)
+
+        os.makedirs(input_dir, exist_ok=True)
+        if not os.path.exists(dst):
+            # Keep it simple: copy file bytes. LoadAudio can decode common formats (wav/mp3).
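+            # copyfile (rather than copy2) is sufficient here: the cache key above
+            # already encodes the source file's mtime and size, so preserving
+            # metadata on the copy does not affect cache validity.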
+ import shutil + + shutil.copyfile(refer_wav, dst) + + self._ref_cache[cache_key] = name + return name + + def _build_prompt_voice_clone( + self, + ref_filename: str, + ref_text: str, + target_text: str, + language: str, + model_choice: str, + precision: str, + seed: int, + max_new_tokens: int, + top_p: float, + top_k: int, + temperature: float, + repetition_penalty: float, + audio_quality: str, + mlx_python: str, + mlx_cli: str, + pause_cfg: Dict[str, float], + ) -> Dict[str, Any]: + # Node IDs are arbitrary but stable in this prompt template. + # 1: LoadAudio -> outputs AUDIO + # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG + # 3: MLX VoiceClone -> outputs AUDIO + # 4: SaveAudioMP3 -> outputs UI audio file info + filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}" + prompt: Dict[str, Any] = { + "1": { + "class_type": "LoadAudio", + "inputs": { + "audio": ref_filename, + }, + }, + "2": { + "class_type": "FB_Qwen3TTSConfig", + "inputs": { + "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)), + "period_pause": float(pause_cfg.get("period_pause", 0.0)), + "comma_pause": float(pause_cfg.get("comma_pause", 0.0)), + "question_pause": float(pause_cfg.get("question_pause", 0.0)), + "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)), + }, + }, + "3": { + "class_type": "MLX_Qwen3TTSVoiceClone", + "inputs": { + "target_text": target_text, + "model_choice": model_choice, + "device": "auto", + "precision": precision, + "language": language, + "ref_audio": ["1", 0], + "ref_text": ref_text, + "seed": int(seed), + "max_new_tokens": int(max_new_tokens), + "top_p": float(top_p), + "top_k": int(top_k), + "temperature": float(temperature), + "repetition_penalty": float(repetition_penalty), + "attention": "auto", + "unload_model_after_generate": False, + "config": ["2", 0], + "mlx_python": mlx_python, + "mlx_cli": mlx_cli, + }, + }, + "4": { + "class_type": "SaveAudioMP3", + "inputs": { + "audio": ["3", 0], + "filename_prefix": filename_prefix, + "quality": audio_quality, + }, + }, + } + return prompt + + def _build_prompt_custom_voice( + self, + target_text: str, + speaker: str, + model_path: str, + instruct: str, + speed: float, + language: str, + seed: int, + max_new_tokens: int, + top_p: float, + top_k: int, + temperature: float, + repetition_penalty: float, + audio_quality: str, + mlx_python: str, + mlx_cli: str, + pause_cfg: Dict[str, float], + ) -> Dict[str, Any]: + # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG + # 3: MLX CustomVoice -> outputs AUDIO + # 4: SaveAudioMP3 -> outputs UI audio file info + filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}" + prompt: Dict[str, Any] = { + "2": { + "class_type": "FB_Qwen3TTSConfig", + "inputs": { + "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)), + "period_pause": float(pause_cfg.get("period_pause", 0.0)), + "comma_pause": float(pause_cfg.get("comma_pause", 0.0)), + "question_pause": float(pause_cfg.get("question_pause", 0.0)), + "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)), + }, + }, + "3": { + "class_type": "MLX_Qwen3TTSCustomVoice", + "inputs": { + "text": target_text, + "speaker": speaker, + "model_path": model_path, + "instruct": instruct or "", + "speed": float(speed), + "language": language, + "seed": int(seed), + "max_new_tokens": int(max_new_tokens), + "top_p": float(top_p), + "top_k": int(top_k), + "temperature": float(temperature), + "repetition_penalty": float(repetition_penalty), + "config": ["2", 0], + 
"mlx_python": mlx_python, + "mlx_cli": mlx_cli, + }, + }, + "4": { + "class_type": "SaveAudioMP3", + "inputs": { + "audio": ["3", 0], + "filename_prefix": filename_prefix, + "quality": audio_quality, + }, + }, + } + return prompt + + async def _queue_and_wait( + self, server: str, prompt: Dict[str, Any], timeout: int + ) -> Dict[str, Any]: + session_manager = await TTSSessionManager.get_instance() + prompt_id = str(uuid.uuid4()) + + post_url = f"{server.rstrip('/')}/prompt" + payload = { + "prompt": prompt, + "client_id": "maibot-tts-voice-plugin", + "prompt_id": prompt_id, + } + + async with session_manager.post( + post_url, json=payload, backend_name=self.backend_name, timeout=timeout + ) as resp: + data = await resp.json(content_type=None) + if resp.status != 200: + raise RuntimeError(f"ComfyUI /prompt 失败: {resp.status} {str(data)[:200]}") + if "error" in data: + raise RuntimeError(f"ComfyUI /prompt 返回错误: {data['error']}") + + # Poll history until prompt_id appears + hist_url = f"{server.rstrip('/')}/history/{prompt_id}" + deadline = time.time() + float(timeout) + while time.time() < deadline: + async with session_manager.get( + hist_url, backend_name=self.backend_name, timeout=timeout + ) as resp: + history = await resp.json(content_type=None) + if prompt_id in history: + return history[prompt_id] + await asyncio.sleep(0.35) + + raise TimeoutError("等待 ComfyUI 生成超时") + + async def _download_output_audio(self, server: str, history_item: Dict[str, Any], timeout: int) -> bytes: + outputs = history_item.get("outputs") or {} + node_out = outputs.get("4") or {} + audios = node_out.get("audio") or [] + if not audios: + # Some failures show up only in status/messages. + status = history_item.get("status") or {} + raise RuntimeError(f"ComfyUI 未产出音频. status={status}") + + a0 = audios[0] + filename = a0.get("filename") + subfolder = a0.get("subfolder", "") + folder_type = a0.get("type", "output") + if not filename: + raise RuntimeError(f"ComfyUI 音频输出结构异常: {a0}") + + q = urlencode({"filename": filename, "subfolder": subfolder, "type": folder_type}) + url = f"{server.rstrip('/')}/view?{q}" + + session_manager = await TTSSessionManager.get_instance() + async with session_manager.get(url, backend_name=self.backend_name, timeout=timeout) as resp: + if resp.status != 200: + txt = await resp.text() + raise RuntimeError(f"ComfyUI /view 失败: {resp.status} {txt[:200]}") + return await resp.read() + + async def execute(self, text: str, voice: Optional[str] = None, **kwargs) -> TTSResult: + is_valid, err = self.validate_config() + if not is_valid: + return TTSResult(False, err, backend_name=self.backend_name) + + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188") + input_dir = self.get_config( + ConfigKeys.COMFYUI_INPUT_DIR, + "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input", + ) + timeout = int(self.get_config(ConfigKeys.COMFYUI_TIMEOUT, self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60))) + + audio_quality = self.get_config(ConfigKeys.COMFYUI_AUDIO_QUALITY, "128k") + mlx_python = self.get_config( + ConfigKeys.COMFYUI_MLX_PYTHON, + "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python", + ) + mlx_cli = self.get_config( + ConfigKeys.COMFYUI_MLX_CLI, + "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py", + ) + + styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {}) + styles = 
self._filter_styles_by_mode(self._normalize_styles_config(styles_raw)) + + style_name = (voice or self.get_default_voice() or "").strip() or "default" + if style_name not in styles: + # For split backends (voiceclone/customvoice), make "wrong style" errors explicit. + if (voice or "").strip() and self.allowed_modes: + return TTSResult( + False, + f"ComfyUI风格 '{style_name}' 不存在或不属于当前后端({self.backend_name})", + backend_name=self.backend_name, + ) + # Fallback order: "default" -> first available style. + if "default" in styles: + style_name = "default" + elif styles: + style_name = sorted(styles.keys())[0] + else: + return TTSResult( + False, + f"ComfyUI 未配置任何风格({self.backend_name})", + backend_name=self.backend_name, + ) + style = styles.get(style_name, {}) + + mode = (style.get("mode") or "voice_clone").strip() + if mode == "voice_clone": + refer_wav = style.get("refer_wav", "") + prompt_text = style.get("prompt_text", "") + if not refer_wav or not prompt_text: + return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(voice_clone)", backend_name=self.backend_name) + elif mode == "custom_voice": + model_path = style.get("model_path", "") + speaker = style.get("speaker", "") + if not model_path or not speaker: + return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(custom_voice)", backend_name=self.backend_name) + else: + return TTSResult(False, f"ComfyUI风格 '{style_name}' mode 无效: {mode}", backend_name=self.backend_name) + + # Map language to the MLX node's language combo. Default to Auto. + detected = TTSTextUtils.detect_language(text) + language = style.get("language") or LANG_TO_DEMO.get(detected, "Auto") + + # Sampling defaults match the MLX node defaults we exposed. + seed = int(style.get("seed", 0) or 0) + model_choice = str(style.get("model_choice", "1.7B") or "1.7B") + precision = str(style.get("precision", "bf16") or "bf16") + max_new_tokens = int(style.get("max_new_tokens", 2048) or 2048) + top_p = float(style.get("top_p", 0.8) or 0.8) + top_k = int(style.get("top_k", 20) or 20) + temperature = float(style.get("temperature", 1.0) or 1.0) + repetition_penalty = float(style.get("repetition_penalty", 1.05) or 1.05) + + pause_cfg = { + "pause_linebreak": float(self.get_config(ConfigKeys.COMFYUI_PAUSE_LINEBREAK, 0.0)), + "period_pause": float(self.get_config(ConfigKeys.COMFYUI_PERIOD_PAUSE, 0.0)), + "comma_pause": float(self.get_config(ConfigKeys.COMFYUI_COMMA_PAUSE, 0.0)), + "question_pause": float(self.get_config(ConfigKeys.COMFYUI_QUESTION_PAUSE, 0.0)), + "hyphen_pause": float(self.get_config(ConfigKeys.COMFYUI_HYPHEN_PAUSE, 0.0)), + } + # Allow per-style override. + if isinstance(style.get("pause_cfg"), dict): + for k in pause_cfg.keys(): + if k in style["pause_cfg"]: + try: + pause_cfg[k] = float(style["pause_cfg"][k]) + except Exception: + pass + + try: + if mode == "voice_clone": + ref_filename = self._ensure_ref_in_input(input_dir, style.get("refer_wav", "")) + prompt = self._build_prompt_voice_clone( + ref_filename=ref_filename, + ref_text=style.get("prompt_text", ""), + target_text=text, + language=language, + model_choice=model_choice, + precision=precision, + seed=seed, + max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + audio_quality=audio_quality, + mlx_python=mlx_python, + mlx_cli=mlx_cli, + pause_cfg=pause_cfg, + ) + else: + # Allow per-style / automatic instruct inference. 
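+                # A style opts in with instruct = "__AUTO__", or with
+                # auto_instruct = true and an empty instruct; either way the
+                # LLM-inferred line below replaces the static value.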
+ instruct = str(style.get("instruct", "")).strip() + auto_style = bool(style.get("auto_instruct", False)) + inferred = "" + if instruct == "__AUTO__" or (not instruct and auto_style): + chat_stream = kwargs.get("chat_stream") + chat_id = kwargs.get("chat_id") + inferred = await self._infer_instruct( + text=text, + detected_lang=detected, + chat_stream=chat_stream, + chat_id=chat_id, + style_name=style_name, + ) + if inferred: + instruct = inferred + + # If the instruct contains usable fields, map them to real controls. + max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40) + instruct, mapped_speed, mapped_pause_cfg = self._apply_instruct_controls( + instruct=instruct, + speed=float(style.get("speed", 1.0) or 1.0), + pause_cfg=pause_cfg, + max_chars=max_chars, + ) + + prompt = self._build_prompt_custom_voice( + target_text=text, + speaker=str(style.get("speaker", "")).strip(), + model_path=str(style.get("model_path", "")).strip(), + instruct=instruct, + speed=mapped_speed, + language=language, + seed=seed, + max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + audio_quality=audio_quality, + mlx_python=mlx_python, + mlx_cli=mlx_cli, + pause_cfg=mapped_pause_cfg, + ) + + logger.info(f"{self.log_prefix} ComfyUI请求: text='{text[:50]}...', style={style_name}") + history_item = await self._queue_and_wait(server, prompt, timeout=timeout) + audio_bytes = await self._download_output_audio(server, history_item, timeout=timeout) + + ok, msg = TTSFileManager.validate_audio_data(audio_bytes) + if not ok: + return TTSResult(False, f"ComfyUI 返回音频无效: {msg}", backend_name=self.backend_name) + + return await self.send_audio( + audio_data=audio_bytes, + audio_format="mp3", + prefix="tts_comfyui", + voice_info=f"style: {style_name}", + ) + except Exception as e: + return TTSResult(False, f"ComfyUI后端错误: {e}", backend_name=self.backend_name) + + +class ComfyUIVoiceCloneBackend(ComfyUIBackend): + backend_name = "comfyui_voiceclone" + backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone 专用)" + allowed_modes = {"voice_clone"} + + def get_default_voice(self) -> str: + v = self.get_config(ConfigKeys.COMFYUI_VOICECLONE_DEFAULT_STYLE, "") or "" + v = v.strip() + return v or super().get_default_voice() + + +class ComfyUICustomVoiceBackend(ComfyUIBackend): + backend_name = "comfyui_customvoice" + backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS CustomVoice 专用)" + allowed_modes = {"custom_voice"} + + def get_default_voice(self) -> str: + v = self.get_config(ConfigKeys.COMFYUI_CUSTOMVOICE_DEFAULT_STYLE, "") or "" + v = v.strip() + return v or super().get_default_voice() diff --git a/backends/cosyvoice.py b/backends/cosyvoice.py new file mode 100644 index 00000000..28199815 --- /dev/null +++ b/backends/cosyvoice.py @@ -0,0 +1,285 @@ +""" +CosyVoice后端实现 +使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成 +""" + +import asyncio +import os +import shutil +from typing import Optional, Tuple +from .base import TTSBackendBase, TTSResult +from ..utils.file import TTSFileManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_cosyvoice") + +# CosyVoice指令映射表(方言、情感、语速等) +COSYVOICE_INSTRUCT_MAP = { + # 方言 + "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>", + "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>", + "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>", + "贵州话": "You are a helpful assistant. 
请用贵州话表达。<|endofprompt|>", + "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>", + "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>", + "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>", + "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>", + "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>", + "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>", + "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>", + "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>", + "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>", + "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>", + "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>", + "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>", + "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>", + + # 音量 + "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>", + "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>", + + # 语速 + "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>", + "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>", + + # 情感 + "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>", + "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>", + "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>", + + # 特殊风格 + "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>", + "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>", +} + + +class CosyVoiceBackend(TTSBackendBase): + """ + CosyVoice语音后端 + + 使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成 + 支持3秒极速复刻、自然语言控制(方言、情感、语速等) + """ + + backend_name = "cosyvoice" + backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)" + support_private_chat = True + default_audio_format = "wav" + + def get_default_voice(self) -> str: + """获取默认音色(CosyVoice 不需要预设音色)""" + return "" + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "") + + if not gradio_url: + return False, "CosyVoice后端缺少必需的 gradio_url 配置" + + return True, "" + + def _resolve_instruct(self, emotion: Optional[str]) -> str: + """ + 解析情感参数为指令文本 + + Args: + emotion: 情感/方言关键词 + + Returns: + 指令文本 + """ + if emotion and emotion in COSYVOICE_INSTRUCT_MAP: + return COSYVOICE_INSTRUCT_MAP[emotion] + + # 返回默认指令(确保不为空) + default_instruct = self.get_config( + ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT, + "You are a helpful assistant. 请用广东话表达。<|endofprompt|>" + ) + + # 如果配置为空,强制使用广东话 + if not default_instruct or not default_instruct.strip(): + default_instruct = "You are a helpful assistant. 
请用广东话表达。<|endofprompt|>" + + return default_instruct + + async def execute( + self, + text: str, + voice: Optional[str] = None, + emotion: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行 CosyVoice 语音合成 + + Args: + text: 待转换的文本 + voice: 音色(对于CosyVoice,这个参数用于指定参考音频路径) + emotion: 情感/方言/语速参数 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "") + mode_config = self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, "3s极速复刻") + + # mode_checkbox_group 实际上是 Radio 组件,期望字符串而不是列表 + # 处理配置可能返回字符串或列表的情况 + if isinstance(mode_config, list): + mode_str = mode_config[0] if mode_config else "3s极速复刻" + else: + mode_str = mode_config if mode_config else "3s极速复刻" + + timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60) + reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "") + prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "") + + # CosyVoice 的"自然语言控制"模式实际上需要参考音频和 prompt_text + # 如果没有配置,使用默认的参考音频 + if not reference_audio or not os.path.exists(reference_audio): + plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + default_audio = os.path.join(plugin_dir, "test.wav") + if os.path.exists(default_audio): + reference_audio = default_audio + logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}") + + # 如果没有 prompt_text,使用默认文本 + if not prompt_text: + prompt_text = "大家好,我是嘉然,今天我来为大家朗读。" + logger.debug(f"{self.log_prefix} 使用默认 prompt_text") + + # voice 参数可以覆盖配置文件中的参考音频 + if voice and os.path.exists(voice): + reference_audio = voice + + # 解析指令文本 + instruct_text = self._resolve_instruct(emotion) + + logger.info( + f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' 
" + f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}" + ) + + try: + # 动态导入 gradio_client(避免全局依赖) + try: + from gradio_client import Client, handle_file + except ImportError: + logger.error(f"{self.log_prefix} gradio_client 未安装,请运行: pip install gradio_client") + return TTSResult( + False, + "gradio_client 未安装,请运行: pip install gradio_client", + backend_name=self.backend_name + ) + + # 创建 Gradio 客户端(设置超时) + try: + import httpx + httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)} + client = Client(gradio_url, httpx_kwargs=httpx_kwargs) + except Exception as e: + logger.warning(f"{self.log_prefix} 无法设置 httpx 超时,使用默认配置: {e}") + client = Client(gradio_url) + + # 准备参数 + logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}") + prompt_wav_upload = handle_file(reference_audio) if reference_audio and os.path.exists(reference_audio) else None + logger.debug(f"{self.log_prefix} 参考音频准备完成") + + # 调用 API + logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)") + logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})") + logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...") + logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...") + + result = await asyncio.wait_for( + asyncio.to_thread( + client.predict, + tts_text=text, + mode_checkbox_group=mode_str, + prompt_text=prompt_text, + prompt_wav_upload=prompt_wav_upload, + prompt_wav_record=None, + instruct_text=instruct_text, + seed=0, + stream=False, # API 实际期望布尔值 False,虽然文档显示为 Literal['False'] + api_name="/generate_audio" + ), + timeout=timeout + ) + + logger.info(f"{self.log_prefix} CosyVoice API 响应成功") + + # result 是生成的音频文件路径 + if not result or not os.path.exists(result): + return TTSResult( + False, + f"CosyVoice 生成失败,未返回有效文件: {result}", + backend_name=self.backend_name + ) + + # 读取音频数据 + try: + with open(result, 'rb') as f: + audio_data = f.read() + except Exception as e: + logger.error(f"{self.log_prefix} 读取音频文件失败: {e}") + return TTSResult( + False, + f"读取音频文件失败: {e}", + backend_name=self.backend_name + ) + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}") + return TTSResult( + False, + f"CosyVoice语音{error_msg}", + backend_name=self.backend_name + ) + + logger.debug( + f"{self.log_prefix} CosyVoice音频数据验证通过 " + f"(大小: {len(audio_data)}字节)" + ) + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav") + voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}" + + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_cosyvoice", + voice_info=voice_info + ) + + except asyncio.TimeoutError: + logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)") + return TTSResult( + False, + "CosyVoice API 调用超时", + backend_name=self.backend_name + ) + except Exception as e: + logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}") + return TTSResult( + False, + f"CosyVoice 执行错误: {e}", + backend_name=self.backend_name + ) diff --git a/backends/doubao.py b/backends/doubao.py new file mode 100644 index 00000000..4c566abf --- /dev/null +++ b/backends/doubao.py @@ -0,0 +1,230 @@ +""" +豆包语音后端实现 +使用字节跳动豆包语音 API 进行语音合成 +""" + +import asyncio +import uuid +from typing import Optional, List, Dict, Tuple +from .base import TTSBackendBase, TTSResult +from .doubao_stream_parser import DoubaoStreamParser +from ..utils.file 
import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_doubao") + +# 豆包语音情感映射表(用于自动生成context_texts) +DOUBAO_EMOTION_MAP = { + # 积极情绪 + "开心": "你的语气再欢乐一点", + "兴奋": "用特别兴奋激动的语气说话", + "温柔": "用温柔体贴的语气说话", + "骄傲": "用骄傲的语气说话", + "自信": "用自信坚定的语气说话", + + # 消极情绪 + "生气": "你得跟我互怼!就是跟我用吵架的语气对话", + "愤怒": "用愤怒的语气说话", + "伤心": "用特别特别痛心的语气说话", + "失望": "用失望沮丧的语气说话", + "委屈": "用委屈的语气说话", + + # 中性情绪 + "平静": "用平静淡定的语气说话", + "严肃": "用严肃认真的语气说话", + "疑惑": "用疑惑不解的语气说话", + + # 语速调整 + "慢速": "说慢一点", + "快速": "说快一点", + + # 音量调整 + "小声": "你嗓门再小点", + "大声": "大声一点", +} + + +class DoubaoBackend(TTSBackendBase): + """ + 豆包语音后端 + + 使用字节跳动豆包语音 API 进行高质量语音合成 + 支持预置音色和复刻音色 + """ + + backend_name = "doubao" + backend_description = "字节跳动豆包语音API" + support_private_chat = True + default_audio_format = "mp3" + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.DOUBAO_DEFAULT_VOICE, "zh_female_shuangkuaisisi_moon_bigtts") + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "") + access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "") + resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "") + + if not app_id or not access_key or not resource_id: + return False, "豆包语音后端缺少必需的认证配置(app_id/access_key/resource_id)" + + return True, "" + + def _resolve_emotion(self, emotion: Optional[str]) -> Optional[List[str]]: + """ + 解析情感参数为 context_texts + + Args: + emotion: 情感关键词 + + Returns: + context_texts 列表或 None + """ + if emotion and emotion in DOUBAO_EMOTION_MAP: + return [DOUBAO_EMOTION_MAP[emotion]] + return None + + async def execute( + self, + text: str, + voice: Optional[str] = None, + emotion: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行豆包语音合成 + + Args: + text: 待转换的文本 + voice: 音色ID + emotion: 情感/语气参数 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + api_url = self.get_config(ConfigKeys.DOUBAO_API_URL, "https://openspeech.bytedance.com/api/v3/tts/unidirectional") + app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "") + access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "") + resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "") + timeout = self.get_config(ConfigKeys.DOUBAO_TIMEOUT, 30) + + if not voice: + voice = self.get_default_voice() + + # 构建请求头 + headers = { + "Content-Type": "application/json", + "X-Api-App-Id": app_id, + "X-Api-Access-Key": access_key, + "X-Api-Resource-Id": resource_id, + "X-Api-Request-Id": str(uuid.uuid4()), + "Accept-Encoding": "gzip, deflate" + } + + # 构建请求体 + request_data: Dict[str, any] = { + "req_params": { + "text": text, + "speaker": voice, + "audio_params": { + "format": self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3"), + "sample_rate": self.get_config(ConfigKeys.DOUBAO_SAMPLE_RATE, 24000), + "bitrate": self.get_config(ConfigKeys.DOUBAO_BITRATE, 128000) + } + } + } + + # 添加可选参数 + speed = self.get_config(ConfigKeys.DOUBAO_SPEED, None) + if speed is not None: + request_data["req_params"]["speed"] = speed + + volume = self.get_config(ConfigKeys.DOUBAO_VOLUME, None) + if volume is not None: + request_data["req_params"]["volume"] = volume + + # 处理 context_texts + context_texts: 
Optional[List[str]] = None + + # 优先使用传入的emotion参数 + if emotion: + context_texts = self._resolve_emotion(emotion) + if context_texts: + logger.info(f"{self.log_prefix} 使用emotion参数: {emotion} -> {context_texts[0]}") + + # 否则使用配置文件的默认值 + if not context_texts: + context_texts = self.get_config(ConfigKeys.DOUBAO_CONTEXT_TEXTS, None) + + if context_texts: + request_data["req_params"]["context_texts"] = context_texts + + logger.info(f"{self.log_prefix} 豆包语音请求: text='{text[:50]}...' (共{len(text)}字符), voice={voice}") + + try: + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + api_url, + json=request_data, + headers=headers, + backend_name="doubao", + timeout=timeout + ) as response: + logger.info(f"{self.log_prefix} 豆包API响应状态码: {response.status}") + + if response.status == 200: + # 使用新的流式响应解析器 + audio_data, error_msg = await DoubaoStreamParser.parse_response( + response, + log_prefix=self.log_prefix + ) + + if error_msg: + logger.error(f"{self.log_prefix} 豆包语音解析失败: {error_msg}") + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + logger.warning(f"{self.log_prefix} 豆包音频数据验证失败: {error_msg}") + return TTSResult(False, f"豆包语音{error_msg}", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} 豆包音频数据验证通过 (大小: {len(audio_data)}字节)") + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3") + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_doubao", + voice_info=f"音色: {voice}" + ) + else: + error_text = await response.text() + logger.error(f"{self.log_prefix} 豆包API请求失败[{response.status}]: {error_text[:200]}") + return TTSResult( + False, + f"豆包语音API调用失败: {response.status} - {error_text[:100]}", + backend_name=self.backend_name + ) + + except asyncio.TimeoutError: + logger.error(f"{self.log_prefix} 豆包API请求超时 (配置超时: {timeout}秒)") + return TTSResult(False, "豆包语音API调用超时", backend_name=self.backend_name) + except Exception as e: + logger.error(f"{self.log_prefix} 豆包语音执行异常: {e}") + return TTSResult(False, f"豆包语音执行错误: {e}", backend_name=self.backend_name) diff --git a/backends/doubao_stream_parser.py b/backends/doubao_stream_parser.py new file mode 100644 index 00000000..a3f61925 --- /dev/null +++ b/backends/doubao_stream_parser.py @@ -0,0 +1,432 @@ +""" +豆包语音流式响应解析器 +基于官方示例实现,确保兼容性和正确性 + +官方API说明: +- code=0: 继续处理,可能包含 "data"(音频)或 "sentence"(文本) +- code=20000000: 结束标志,可能包含 "usage"(用量统计) +- code>0: 错误响应 +""" + +import json +import base64 +from typing import Tuple, Optional, List +from src.common.logger import get_logger + +logger = get_logger("doubao_stream_parser") + + +class DoubaoStreamParser: + """ + 豆包语音流式响应解析器 + + 基于官方API实现,忠实还原官方示例逻辑。 + 处理流程: + 1. 逐行读取 JSON 响应 + 2. 检查状态码:code=0(继续), code=20000000(结束), code>0(错误) + 3. 提取音频数据(code=0 且有 "data" 字段) + 4. 
记录日志(code=0 且有 "sentence" 字段) + """ + + def __init__(self, log_prefix: str = "[DoubaoParser]"): + """ + 初始化解析器 + + Args: + log_prefix: 日志前缀 + """ + self.log_prefix = log_prefix + self._audio_chunks: List[bytes] = [] + self._buffer: bytes = b'' + self._line_count: int = 0 + self._total_bytes: int = 0 + self._error_message: Optional[str] = None + self._finished: bool = False # 是否收到结束信号 + self._usage_info: Optional[dict] = None + + def _decode_audio_from_base64(self, audio_base64: str) -> Optional[bytes]: + """ + 从 Base64 字符串解码音频数据 + + 官方示例中直接使用 base64.b64decode(data["data"]), + 但我们添加了额外的容错和验证。 + + Args: + audio_base64: Base64 编码的音频数据 + + Returns: + 解码后的音频字节数据或 None + """ + if not audio_base64: + return None + + try: + # 官方示例直接调用 base64.b64decode() + # 这里添加容错处理:补充填充符(如果需要) + padding_needed = len(audio_base64) % 4 + if padding_needed: + audio_base64 += '=' * (4 - padding_needed) + logger.debug( + f"{self.log_prefix} Base64填充已应用 " + f"(原长: {len(audio_base64) - (4 - padding_needed)}, 新长: {len(audio_base64)})" + ) + + audio_bytes = base64.b64decode(audio_base64) + + if not audio_bytes: + logger.warning(f"{self.log_prefix} Base64解码结果为空") + return None + + logger.debug( + f"{self.log_prefix} 音频块解码成功 - 大小: {len(audio_bytes)}字节" + ) + return audio_bytes + + except Exception as e: + logger.error( + f"{self.log_prefix} Base64解码失败: {e} " + f"(Base64长度: {len(audio_base64)})" + ) + return None + + def _process_json_line(self, line_str: str) -> Optional[str]: + """ + 处理单行 JSON 数据 + + 严格按照官方示例逻辑: + 1. 检查 code 字段 + 2. code=0 且有 data → 提取音频 + 3. code=0 且有 sentence → 记录文本(可选) + 4. code=20000000 → 收到结束信号 + 5. code>0 → 错误 + + Args: + line_str: JSON 字符串 + + Returns: + 如果收到结束信号,返回 "END";如果发生错误,返回错误信息;否则返回 None + """ + try: + json_obj = json.loads(line_str) + except json.JSONDecodeError as e: + logger.debug(f"{self.log_prefix} JSON解析失败: {e}") + return None + except Exception as e: + logger.warning(f"{self.log_prefix} JSON处理异常: {e}") + return None + + if not isinstance(json_obj, dict): + logger.debug( + f"{self.log_prefix} 收到非字典JSON对象: {type(json_obj).__name__}" + ) + return None + + code = json_obj.get("code", -1) + + # ✅ 官方逻辑:处理 code=0 的数据帧 + if code == 0: + # 检查是否有音频数据 + if "data" in json_obj and json_obj["data"]: + chunk_audio = self._decode_audio_from_base64(json_obj["data"]) + if chunk_audio: + self._audio_chunks.append(chunk_audio) + logger.debug( + f"{self.log_prefix} 音频块#{len(self._audio_chunks)} 已接收 " + f"(大小: {len(chunk_audio)}字节)" + ) + + # 检查是否有文本/句子信息(可选) + if "sentence" in json_obj and json_obj["sentence"]: + sentence_data = json_obj.get("sentence", {}) + logger.debug( + f"{self.log_prefix} 收到句子数据: {sentence_data}" + ) + + return None # 继续处理 + + # ✅ 官方逻辑:处理 code=20000000 的结束帧 + elif code == 20000000: + logger.info(f"{self.log_prefix} 收到流结束信号 (code=20000000)") + + # 记录用量信息(如果有) + if "usage" in json_obj: + self._usage_info = json_obj["usage"] + logger.info( + f"{self.log_prefix} 豆包用量信息: {self._usage_info}" + ) + + self._finished = True + return "END" # 表示流已结束 + + # ✅ 官方逻辑:错误处理 + elif code and code > 0: + error_msg = json_obj.get("message", f"未知错误 (code={code})") + logger.error( + f"{self.log_prefix} 豆包语音API返回错误 " + f"(code={code}): {error_msg}" + ) + self._error_message = error_msg + return error_msg # 返回错误信息 + + # 未知状态码 + else: + logger.debug( + f"{self.log_prefix} 收到未知状态码: code={code}" + ) + return None + + def _find_data_chunk_offset(self, header: bytes) -> int: + """ + 在 WAV header 中查找 'data' 块的位置 + + 豆包返回的 WAV 可能包含额外的元数据块(如 LIST/INFO), + 导致 'data' 块不在标准的 44 字节位置。 + + Args: + header: WAV 文件头部数据 + + 
Returns: + data 块数据开始的位置(即 'data' + 4字节大小之后) + """ + pos = 12 # 跳过 RIFF(4) + size(4) + WAVE(4) + + while pos < len(header) - 8: + chunk_id = header[pos:pos+4] + chunk_size = int.from_bytes(header[pos+4:pos+8], 'little') + + if chunk_id == b'data': + return pos + 8 # 返回音频数据开始位置 + + # 移动到下一个块 + pos += 8 + chunk_size + # WAV 块需要对齐到偶数字节 + if chunk_size % 2 == 1: + pos += 1 + + # 未找到 data 块,返回默认值 + return 44 + + def _merge_audio_chunks(self, chunks: List[bytes]) -> bytes: + """ + 合并音频块,处理 WAV 格式的流式响应 + + 豆包流式 WAV 响应特点: + 1. 第一个块包含完整 header(可能 > 44 字节,含 LIST/INFO 元数据) + 2. header 中的大小字段是 0xFFFFFFFF(流式占位符) + 3. 后续块是纯音频数据(无 header) + 4. 需要在合并后修正大小字段 + + Args: + chunks: 音频数据块列表 + + Returns: + 合并后的有效 WAV 文件 + """ + if not chunks: + return b'' + + first_chunk = chunks[0] + + # 检查是否是 WAV 格式(RIFF header) + if len(first_chunk) < 44 or first_chunk[:4] != b'RIFF': + # 不是 WAV 格式(如 MP3),直接拼接 + return b''.join(chunks) + + # 查找 data 块的实际位置 + data_offset = self._find_data_chunk_offset(first_chunk) + logger.debug(f"{self.log_prefix} WAV data 块偏移: {data_offset} 字节") + + # 提取 header 和第一块的音频数据 + header = bytearray(first_chunk[:data_offset]) + data_parts = [first_chunk[data_offset:]] + skipped_headers = 0 + + # 处理后续块 + for chunk in chunks[1:]: + if len(chunk) > 44 and chunk[:4] == b'RIFF': + # 后续块也有 RIFF header,需要跳过 + chunk_data_offset = self._find_data_chunk_offset(chunk) + data_parts.append(chunk[chunk_data_offset:]) + skipped_headers += 1 + else: + # 纯音频数据 + data_parts.append(chunk) + + # 合并所有音频数据 + audio_data = b''.join(data_parts) + audio_size = len(audio_data) + + # 修正 WAV header 中的大小字段 + # 字节 4-7: 文件总大小 - 8 = (header_size - 8) + audio_size + file_size = len(header) - 8 + audio_size + header[4:8] = file_size.to_bytes(4, 'little') + + # 修正 data 块的大小字段(位于 data_offset - 4 处) + header[data_offset-4:data_offset] = audio_size.to_bytes(4, 'little') + + if skipped_headers > 0 or audio_size > 0: + logger.info( + f"{self.log_prefix} WAV 流式合并完成: " + f"header={len(header)}字节, 音频={audio_size}字节, " + f"跳过重复header={skipped_headers}" + ) + + return bytes(header) + audio_data + + def feed_chunk(self, chunk: bytes) -> Optional[str]: + """ + 输入一块数据 + + Args: + chunk: 网络数据块 + + Returns: + 如果遇到错误或结束,返回相应信息;否则返回 None + """ + if not chunk: + return None + + self._buffer += chunk + self._total_bytes += len(chunk) + + # 按行处理(官方示例使用 iter_lines) + while b'\n' in self._buffer: + line_bytes, self._buffer = self._buffer.split(b'\n', 1) + + # 尝试解码行数据 + try: + line_str = line_bytes.decode('utf-8', errors='replace').strip() + except Exception as e: + logger.warning( + f"{self.log_prefix} 行解码失败: {e}, 跳过该行" + ) + self._line_count += 1 + continue + + if not line_str: + continue + + self._line_count += 1 + + # 处理该行 + result = self._process_json_line(line_str) + + # 如果收到结束信号或错误,立即返回 + if result == "END": + return None # 正常结束 + elif result: # 返回的是错误信息 + return result + + return None + + def finalize(self) -> Tuple[Optional[bytes], Optional[str]]: + """ + 完成解析,处理剩余数据 + + Returns: + (audio_data, error_message) + - audio_data: 合并后的音频数据(成功时) + - error_message: 错误信息(失败时) + """ + # 处理剩余的 buffer 中的最后一行 + if self._buffer.strip(): + try: + line_str = self._buffer.decode('utf-8', errors='replace').strip() + if line_str: + logger.debug( + f"{self.log_prefix} 处理最后的buffer数据 " + f"(长度: {len(line_str)}字符)" + ) + result = self._process_json_line(line_str) + if result and result != "END": + # 最后的 buffer 包含错误 + self._error_message = result + except Exception as e: + logger.warning( + f"{self.log_prefix} 最后buffer解析异常: {e}" + ) + + logger.info( + f"{self.log_prefix} 
豆包流解析完成 - " + f"处理行数: {self._line_count}, " + f"音频块数: {len(self._audio_chunks)}, " + f"接收字节数: {self._total_bytes}, " + f"正常结束: {self._finished}" + ) + + # 检查是否有错误 + if self._error_message: + logger.error( + f"{self.log_prefix} 豆包API返回错误: {self._error_message}" + ) + return None, f"豆包语音API错误: {self._error_message}" + + # 检查是否有音频数据 + if not self._audio_chunks: + if self._total_bytes == 0: + logger.warning( + f"{self.log_prefix} 豆包API未返回任何数据" + ) + return None, "未收到任何响应数据" + + logger.warning( + f"{self.log_prefix} 收到 {self._total_bytes} 字节数据但无音频块" + ) + return None, "豆包语音未返回任何音频数据" + + # ✅ 额外的数据完整性检查 + # 过滤掉过小的块(可能是损坏或无效的) + min_chunk_size = 50 # 最小块大小 + valid_chunks = [ + chunk for chunk in self._audio_chunks + if len(chunk) >= min_chunk_size + ] + + if not valid_chunks: + logger.error( + f"{self.log_prefix} 所有音频块都太小 (可能是损坏的数据)" + ) + logger.debug( + f"{self.log_prefix} 块大小分布: {[len(c) for c in self._audio_chunks]}" + ) + return None, "音频数据不完整或已损坏" + + # 合并所有有效的音频数据(处理 WAV 多 header 问题) + merged_audio = self._merge_audio_chunks(valid_chunks) + + logger.info( + f"{self.log_prefix} 音频合并完成 - " + f"有效块数: {len(valid_chunks)}/{len(self._audio_chunks)}, " + f"总大小: {len(merged_audio)}字节" + ) + + return merged_audio, None + + @classmethod + async def parse_response( + cls, + response, + log_prefix: str = "[DoubaoParser]" + ) -> Tuple[Optional[bytes], Optional[str]]: + """ + 解析豆包 API 的流式响应 + + Args: + response: aiohttp 响应对象 + log_prefix: 日志前缀 + + Returns: + (audio_data, error_message) + """ + parser = cls(log_prefix) + + # 逐块读取响应流 + async for chunk in response.content.iter_any(): + result = parser.feed_chunk(chunk) + + # 如果遇到错误,立即返回 + if result and result != "END": + return None, result + + # 完成解析,处理剩余数据 + return parser.finalize() diff --git a/backends/gpt_sovits.py b/backends/gpt_sovits.py new file mode 100644 index 00000000..126851ff --- /dev/null +++ b/backends/gpt_sovits.py @@ -0,0 +1,326 @@ +""" +GPT-SoVITS 后端实现 +使用本地 GPT-SoVITS 服务进行语音合成 +""" + +import asyncio +from typing import Optional, Dict, Any, Tuple, ClassVar +from .base import TTSBackendBase, TTSResult +from ..utils.text import TTSTextUtils +from ..utils.file import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_gpt_sovits") + + +class GPTSoVITSBackend(TTSBackendBase): + """ + GPT-SoVITS 后端 + + 使用本地 GPT-SoVITS 服务进行高度定制化的语音合成 + 支持动态切换 GPT 和 SoVITS 模型权重 + """ + + backend_name = "gpt_sovits" + backend_description = "本地GPT-SoVITS服务" + support_private_chat = True + default_audio_format = "mp3" + + # 类变量:记录当前加载的模型路径,避免重复切换 + _current_gpt_weights: ClassVar[Optional[str]] = None + _current_sovits_weights: ClassVar[Optional[str]] = None + + def get_default_voice(self) -> str: + """获取默认风格""" + return "default" + + async def _switch_model( + self, + server: str, + gpt_weights: Optional[str], + sovits_weights: Optional[str], + timeout: int + ) -> Tuple[bool, str]: + """ + 切换 GPT-SoVITS 模型权重 + + Args: + server: 服务器地址 + gpt_weights: GPT 模型权重路径 + sovits_weights: SoVITS 模型权重路径 + timeout: 超时时间 + + Returns: + (success, error_message) + """ + session_manager = await TTSSessionManager.get_instance() + + async def _set_model_v1() -> Tuple[bool, str]: + # 兼容旧版 api.py: 仅支持 /set_model 同时切换 + if not gpt_weights or not sovits_weights: + return False, "当前GPT-SoVITS服务不支持单独切换模型(请同时配置GPT与SoVITS权重)" + set_model_url = ( + f"{server.rstrip('/')}/set_model?" 
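+                # Note: the weight paths are interpolated without URL-encoding;
+                # if a path could contain '&' or spaces it would need quoting
+                # (e.g. urllib.parse.quote); plain ASCII paths are assumed here.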
+ f"gpt_model_path={gpt_weights}&sovits_model_path={sovits_weights}" + ) + logger.info(f"{self.log_prefix} 切换模型(兼容模式): {gpt_weights} | {sovits_weights}") + try: + async with session_manager.get( + set_model_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_gpt_weights = gpt_weights + GPTSoVITSBackend._current_sovits_weights = sovits_weights + logger.info(f"{self.log_prefix} 模型切换成功(兼容模式)") + return True, "" + error_text = await response.text() + return False, f"模型切换失败: {error_text}" + except Exception as e: + return False, f"模型切换异常: {e}" + + # 切换 GPT 权重 + if gpt_weights and gpt_weights != GPTSoVITSBackend._current_gpt_weights: + gpt_url = f"{server.rstrip('/')}/set_gpt_weights?weights_path={gpt_weights}" + logger.info(f"{self.log_prefix} 切换GPT模型: {gpt_weights}") + + try: + async with session_manager.get( + gpt_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_gpt_weights = gpt_weights + logger.info(f"{self.log_prefix} GPT模型切换成功") + elif response.status == 404: + # 旧版服务没有 /set_gpt_weights + return await _set_model_v1() + else: + error_text = await response.text() + return False, f"GPT模型切换失败: {error_text}" + except Exception as e: + return False, f"GPT模型切换异常: {e}" + + # 切换 SoVITS 权重 + if sovits_weights and sovits_weights != GPTSoVITSBackend._current_sovits_weights: + sovits_url = f"{server.rstrip('/')}/set_sovits_weights?weights_path={sovits_weights}" + logger.info(f"{self.log_prefix} 切换SoVITS模型: {sovits_weights}") + + try: + async with session_manager.get( + sovits_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_sovits_weights = sovits_weights + logger.info(f"{self.log_prefix} SoVITS模型切换成功") + elif response.status == 404: + # 旧版服务没有 /set_sovits_weights + return await _set_model_v1() + else: + error_text = await response.text() + return False, f"SoVITS模型切换失败: {error_text}" + except Exception as e: + return False, f"SoVITS模型切换异常: {e}" + + return True, "" + + def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]: + """ + 规范化风格配置格式 + + 支持两种格式: + 1. 旧格式(字典): {"default": {...}, "happy": {...}} + 2. 
新格式(数组): [{"name": "default", ...}, {"name": "happy", ...}] + + 统一转换为字典格式供内部使用 + """ + # 如果是字典格式(旧格式),直接返回 + if isinstance(styles_config, dict): + return styles_config + + # 如果是数组格式(新格式),转换为字典 + if isinstance(styles_config, list): + result = {} + for style in styles_config: + if isinstance(style, dict) and "name" in style: + style_name = style["name"] + # 复制配置,移除 name 字段 + style_data = {k: v for k, v in style.items() if k != "name"} + result[style_name] = style_data + return result + + # 其他情况返回空字典 + return {} + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {}) + styles = self._normalize_styles_config(styles_raw) + + if not styles or "default" not in styles: + return False, "GPT-SoVITS未配置任何语音风格" + + default_style = styles.get("default", {}) + if not default_style.get("refer_wav") or not default_style.get("prompt_text"): + return False, "GPT-SoVITS默认风格配置不完整(需要refer_wav和prompt_text)" + + return True, "" + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行GPT-SoVITS语音合成 + + Args: + text: 待转换的文本 + voice: 风格名称 + + Returns: + TTSResult + """ + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + server = self.get_config(ConfigKeys.GPT_SOVITS_SERVER, "http://127.0.0.1:9880") + styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {}) + styles = self._normalize_styles_config(styles_raw) + timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60) + + # 确定使用的风格 + voice_style = voice if voice and voice in styles else "default" + + if voice_style not in styles: + return TTSResult( + False, + f"GPT-SoVITS风格 '{voice_style}' 未配置", + backend_name=self.backend_name + ) + + style_config = styles[voice_style] + refer_wav_path = style_config.get("refer_wav", "") + prompt_text = style_config.get("prompt_text", "") + prompt_language = style_config.get("prompt_language", "zh") + gpt_weights = style_config.get("gpt_weights") + sovits_weights = style_config.get("sovits_weights") + + if not refer_wav_path or not prompt_text: + return TTSResult( + False, + f"GPT-SoVITS风格 '{voice_style}' 配置不完整", + backend_name=self.backend_name + ) + + # 如果配置了模型权重,先切换模型 + if gpt_weights or sovits_weights: + switch_success, switch_error = await self._switch_model( + server, gpt_weights, sovits_weights, timeout + ) + if not switch_success: + return TTSResult(False, switch_error, backend_name=self.backend_name) + + # 检测文本语言 + text_language = TTSTextUtils.detect_language(text) + + # 构建请求数据 + data = { + "text": text, + "text_lang": text_language, + "ref_audio_path": refer_wav_path, + "prompt_text": prompt_text, + "prompt_lang": prompt_language + } + + tts_url = f"{server.rstrip('/')}/tts" + legacy_tts_url = f"{server.rstrip('/')}/" + legacy_data = { + "text": text, + "text_language": text_language, + "refer_wav_path": refer_wav_path, + "prompt_text": prompt_text, + "prompt_language": prompt_language, + } + + logger.info(f"{self.log_prefix} GPT-SoVITS请求: text='{text[:50]}...', style={voice_style}") + + try: + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + tts_url, + json=data, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + audio_data = await response.read() + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return TTSResult(False, f"GPT-SoVITS{error_msg}", 
backend_name=self.backend_name) + + # 使用统一的发送方法 + return await self.send_audio( + audio_data=audio_data, + audio_format="wav", + prefix="tts_gpt_sovits", + voice_info=f"风格: {voice_style}" + ) + elif response.status == 404: + # 兼容旧版 api.py:没有 /tts 端点,回退到根路径 + logger.warning(f"{self.log_prefix} /tts 端点不存在,尝试兼容模式请求根路径") + else: + error_info = await response.text() + logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}") + return TTSResult( + False, + f"GPT-SoVITS API调用失败: {response.status}", + backend_name=self.backend_name + ) + + # 仅在 /tts 404 时回退到旧版根路径 + async with session_manager.post( + legacy_tts_url, + json=legacy_data, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + audio_data = await response.read() + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name) + + return await self.send_audio( + audio_data=audio_data, + audio_format="wav", + prefix="tts_gpt_sovits", + voice_info=f"风格: {voice_style}" + ) + else: + error_info = await response.text() + logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}") + return TTSResult( + False, + f"GPT-SoVITS API调用失败: {response.status}", + backend_name=self.backend_name + ) + + except asyncio.TimeoutError: + return TTSResult(False, "GPT-SoVITS API调用超时", backend_name=self.backend_name) + except Exception as e: + logger.error(f"{self.log_prefix} GPT-SoVITS执行错误: {e}") + return TTSResult(False, f"GPT-SoVITS执行错误: {e}", backend_name=self.backend_name) diff --git a/backends/gsv2p.py b/backends/gsv2p.py new file mode 100644 index 00000000..8837d881 --- /dev/null +++ b/backends/gsv2p.py @@ -0,0 +1,186 @@ +""" +GSV2P 后端实现 +使用 GSV2P 云端 API 进行语音合成 +""" + +import asyncio +import json +from typing import Optional, Dict, Any, Tuple +from .base import TTSBackendBase, TTSResult +from ..utils.file import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_gsv2p") + +# 重试配置 +MAX_RETRIES = 5 # 最大重试次数 +RETRY_DELAY = 3.0 # 重试间隔(秒) + + +class GSV2PBackend(TTSBackendBase): + """ + GSV2P 后端 + + 使用 GSV2P 云端 API 进行高质量语音合成 + """ + + backend_name = "gsv2p" + backend_description = "GSV2P云端API语音合成" + support_private_chat = True + default_audio_format = "mp3" + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.GSV2P_DEFAULT_VOICE, "原神-中文-派蒙_ZH") + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "") + if not api_token: + return False, "GSV2P后端缺少API Token配置" + return True, "" + + async def _make_request( + self, + api_url: str, + request_data: Dict[str, Any], + headers: Dict[str, str], + timeout: int + ) -> Tuple[bool, Any, str]: + """ + 发送单次API请求 + + Returns: + (成功标志, 音频数据或None, 错误信息) + """ + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + api_url, + json=request_data, + headers=headers, + backend_name="gsv2p", + timeout=timeout + ) as response: + if response.status == 200: + content_type = response.headers.get('Content-Type', '') + audio_data = await response.read() + + # 检查是否返回了JSON错误(服务端不稳定时会返回参数错误) + if 'application/json' in content_type: + try: + error_json = json.loads(audio_data.decode('utf-8')) + error_msg = error_json.get('error', {}).get('message', 
str(error_json)) + # 参数错误通常是服务端临时问题,可以重试 + return False, None, f"API返回错误: {error_msg}" + except Exception: + return False, None, "API返回异常响应" + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return False, None, f"音频数据无效: {error_msg}" + + return True, audio_data, "" + else: + error_text = await response.text() + return False, None, f"API调用失败: {response.status} - {error_text[:100]}" + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行GSV2P语音合成(带重试机制) + + Args: + text: 待转换的文本 + voice: 音色名称 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + api_url = self.get_config(ConfigKeys.GSV2P_API_URL, "https://gsv2p.acgnai.top/v1/audio/speech") + api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "") + timeout = self.get_config(ConfigKeys.GSV2P_TIMEOUT, 30) + + if not voice: + voice = self.get_default_voice() + + # 构建请求参数(注意:other_params 已被 API 废弃,不再支持) + request_data: Dict[str, Any] = { + "model": self.get_config(ConfigKeys.GSV2P_MODEL, "tts-v4"), + "input": text, + "voice": voice, + "response_format": self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3"), + "speed": self.get_config(ConfigKeys.GSV2P_SPEED, 1) + } + + headers = { + "accept": "application/json", + "Authorization": f"Bearer {api_token}", + "Content-Type": "application/json" + } + + logger.info(f"{self.log_prefix} GSV2P请求: text='{text[:50]}...', voice={voice}") + logger.debug(f"{self.log_prefix} GSV2P完整请求参数: {json.dumps(request_data, ensure_ascii=False, indent=2)}") + + last_error = "" + for attempt in range(1, MAX_RETRIES + 1): + try: + success, audio_data, error_msg = await self._make_request( + api_url, request_data, headers, timeout + ) + + if success and audio_data: + if attempt > 1: + logger.info(f"{self.log_prefix} GSV2P第{attempt}次重试成功") + + logger.info(f"{self.log_prefix} GSV2P响应: 数据大小={len(audio_data)}字节") + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3") + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_gsv2p", + voice_info=f"音色: {voice}" + ) + else: + last_error = error_msg + if attempt < MAX_RETRIES: + logger.warning(f"{self.log_prefix} GSV2P请求失败 ({error_msg}), {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})") + await asyncio.sleep(RETRY_DELAY) + else: + logger.error(f"{self.log_prefix} GSV2P请求失败,已达最大重试次数: {error_msg}") + + except asyncio.TimeoutError: + last_error = "API调用超时" + if attempt < MAX_RETRIES: + logger.warning(f"{self.log_prefix} GSV2P超时, {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})") + await asyncio.sleep(RETRY_DELAY) + else: + logger.error(f"{self.log_prefix} GSV2P超时,已达最大重试次数") + + except Exception as e: + last_error = str(e) + logger.error(f"{self.log_prefix} GSV2P执行错误: {e}") + if attempt < MAX_RETRIES: + await asyncio.sleep(RETRY_DELAY) + else: + break + + return TTSResult(False, f"GSV2P {last_error} (已重试{MAX_RETRIES}次)", backend_name=self.backend_name) diff --git a/config.toml b/config.toml new file mode 100644 index 00000000..9c045560 --- /dev/null +++ b/config.toml @@ -0,0 +1,292 @@ +# tts_voice_plugin - 自动生成的配置文件 +# 统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice五种后端引擎,提供灵活的语音合成能力。 + +# 插件基本配置 +[plugin] + +# 是否启用插件 +enabled = true + +# 配置文件版本 
+config_version = "3.2.3" + +# 通用设置 + +[general] + +# 默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui) +# 可选: ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice +default_backend = "comfyui_customvoice" + +# 请求超时时间(秒) +timeout = 60 + +# 最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断) +max_text_length = 200 + +# 是否使用replyer润色语音内容 +use_replyer_rewrite = true + +# 音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录) +audio_output_dir = "" + +# 是否使用base64编码发送音频(备选方案) +use_base64_audio = true + +# 是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题) +split_sentences = true + +# 分段发送时每条语音之间的延迟(秒) +split_delay = 0.3 + +# 自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段) +split_min_total_chars = 120 + +# 句子最小长度:过短片段会合并到前一句(用于减少碎片段) +split_min_sentence_chars = 6 + +# 自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。 +split_max_segments = 3 + +# 自动分段打包目标长度(字符)。用于把多句合并成更少段。 +split_chunk_chars = 110 + +# 是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户) +send_error_messages = true + +# 组件启用控制 + +[components] + +# 是否启用Action组件 +action_enabled = true + +# 是否启用Command组件 +command_enabled = true + +# 是否启用 instruct 调试命令组件(/tts_instruct) +instruct_command_enabled = true + +# 概率控制配置 + +[probability] + +# 是否启用概率控制 +enabled = true + +# 基础触发概率 +base_probability = 1 + +# 关键词强制触发 +keyword_force_trigger = true + +# 强制触发关键词 +force_keywords = [ + "一定要用语音", + "必须语音", + "语音回复我", + "务必用语音", +] + +# AI Voice后端配置 + +[ai_voice] + +# 默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女) +default_character = "邻家小妹" + +# GSV2P后端配置 + +[gsv2p] + +# GSV2P API地址 +api_url = "https://gsv2p.acgnai.top/v1/audio/speech" + +# API认证Token +api_token = "" + +# 默认音色 +default_voice = "原神-中文-派蒙_ZH" + +# API请求超时(秒) +timeout = 149 + +# TTS模型 +model = "tts-v4" + +# 音频格式 +response_format = "wav" + +# 语音速度 +speed = 1 + +# GPT-SoVITS后端配置 + +[gpt_sovits] + +# GPT-SoVITS服务地址 +server = "http://127.0.0.1:9880" + +# 语音风格配置 + +# 豆包语音后端配置 + +[[gpt_sovits.styles]] +name = "default" +refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/s978ztt245c3jxms6apadwgna4e7hmb.mp3" +prompt_text = "私にしてはがんばった方ではないでしょーか?" +prompt_language = "ja" +gpt_weights = "/Users/xenon/Downloads/GPT-SoVITS/GPT_weights_v4/seiun-e15.ckpt" +sovits_weights = "/Users/xenon/Downloads/GPT-SoVITS/SoVITS_weights_v4/seiun_e2_s144_l32.pth" + +[[gpt_sovits.styles]] +name = "" +refer_wav = "" +prompt_text = "" +prompt_language = "zh" +gpt_weights = "" +sovits_weights = "" + +[doubao] + +# 豆包语音API地址 +api_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional" + +# 豆包APP ID +app_id = "" + +# 豆包Access Key +access_key = "" + +# 豆包Resource ID +resource_id = "seed-tts-2.0" + +# 默认音色 +default_voice = "zh_female_vv_uranus_bigtts" + +# API请求超时(秒) +timeout = 60 + +# 音频格式 +audio_format = "wav" + +# 采样率 +sample_rate = 24000 + +# 比特率 +bitrate = 128000 + +# CosyVoice后端配置 + +[cosyvoice] + +# Gradio API地址 +gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/" + +# 推理模式(3s极速复刻/自然语言控制) +default_mode = "3s极速复刻" + +# 默认指令(用于自然语言控制模式) +default_instruct = "You are a helpful assistant. 
请用广东话表达。<|endofprompt|>" + +# 参考音频路径(用于3s极速复刻模式) +reference_audio = "" + +# 提示文本(用于3s极速复刻模式) +prompt_text = "" + +# API请求超时(秒) +timeout = 300 + +# 音频格式 +audio_format = "wav" + +[comfyui] +server = "http://127.0.0.1:8188" +# 必须是 ComfyUI 的 input 目录, backend 会把 refer_wav 复制进去, 再用 LoadAudio 读取 +input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input" +timeout = 120 +audio_quality = "128k" # SaveAudioMP3: V0/128k/320k +mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python" +mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py" +default_style = "default" +# Split comfyui backend into two convenient aliases: +# - comfyui_voiceclone: only uses styles whose mode is voice_clone (or absent) +# - comfyui_customvoice: only uses styles whose mode is custom_voice +# These keys let you pick different defaults without duplicating comfyui.styles. +voiceclone_default_style = "default" +customvoice_default_style = "seiun" +auto_instruct_enabled = true +auto_instruct_max_chars = 320 + +# 自动推断 instruct 时固定附加的“基调”(persona)。会作为 `基调=...;` 前缀插入。 +# 注意:值里不要包含 ';' 或 '='(backend 会做清洗,但建议从源头避免)。 +auto_instruct_base_tone = "女性约15-16岁,清澈透亮但慵懒的轻女高音,句尾元音随意拉长且略带鼻腔撒娇,咬字松弛像刚睡醒,可在慵懒与冷静锐利间切换,带戏谑亲和" + +# 可选:完整基调原文(保留备份,当前不启用) +# auto_instruct_base_tone = """ +# 女性,外表约15-16岁,音色是清澈透亮却带有慵懒感的轻女高音(Light Soprano)。 +# +# 嗓音轻盈飘逸,带有明显的“云朵般”的漂浮感,起初是漫不经心的拖沓语调,其特征在于句尾元音的随意拉长(Drawl)以及略带鼻腔共鸣的撒娇感。咬字呈现出一种仿佛刚睡醒般的松弛,甚至伴有刻意为之的含糊,像是一只在阳光下伸懒腰的猫。 +# +# 随后,这种慵懒被一种狡黠的机敏所取代,声音在毫无干劲的叹息与看穿一切的通透感之间自如切换。在表现谋略或胜负欲的瞬间,音色会瞬间收紧,去除了所有的气声装饰与慵懒拖音,转为冷静、干练且直击要害的中高频。 +# +# 表现风格既显得捉摸不透又带有戏谑的亲和力,伴随着轻巧的换气声和偶尔出现的、带有试探意味的升调尾音。仿佛在脱力系(Listless)的无害表象之下,潜藏着绝顶聪明的头脑与绝不让步的自尊。 +# """ + +auto_instruct_prompt = """ +你是精通声学特征与戏剧表演的 AI 配音导演。你的任务是根据「待朗读文本」生成一行 TTS instruct(用于 Qwen3-TTS CustomVoice 的语音表演控制)。 + +硬性要求: +- 只输出一行(单行 KV),不要解释,不要引号/代码块,不要复述原文。 +- 必须同时包含以下字段,并用英文分号 ';' 分隔:情绪、强度、语速、停顿、表现 +- 输出格式固定为:情绪=<...>;强度=<...>;语速=<...>;停顿=<...>;表现=<...> +- 语速可选:很慢/稍慢/正常/稍快/很快 +- 停顿可选:很少/自然/稍多/很多 +- 强度可选:很低/低/中/高/很高 +- 表现:用 3-6 个短提示词,使用逗号分隔(不要用分号),如:声压高,咬字重,重音强,尾音下压 +- 长度 <= {max_chars} 字 + +强制增强规则(避免“生气但听起来不够生气”): +- 如果文本出现:非常/极其/真的/气死/怒/吼/滚/闭嘴/你再说一次 等强烈信号,情绪优先用「愤怒」,强度至少「高」,表现要包含“声压高/咬字重/重音强/尾音下压”中的至少 2 项。 +- 如果是嘲讽或冷笑式的怒气:情绪写「愤怒(冷)」或「愤怒+嘲讽」,表现包含“冷硬/压低/咬字利落/少气声”。 + +文本语言: {lang} +待朗读文本: {text} +""" + +# 基础停顿(秒)。当 instruct 包含“停顿=...”时,会按 很少/自然/稍多/很多 做倍率缩放。 +pause_linebreak = 0.18 +period_pause = 0.22 +comma_pause = 0.1 +question_pause = 0.2 +hyphen_pause = 0.06 + +[[comfyui.styles]] +name = "default" +refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/default_ref_24k_mono.wav" +prompt_text = "私にしてはがんばった方ではないでしょーか?" 
+language = "Auto" +model_choice = "1.7B" +precision = "bf16" +seed = 0 +max_new_tokens = 2048 +top_p = 0.8 +top_k = 20 +temperature = 1 +repetition_penalty = 1.05 + +[[comfyui.styles]] +name = "seiun" +mode = "custom_voice" +model_path = "/Users/xenon/Downloads/checkpoint-epoch-9" +speaker = "seiun" +instruct = "__AUTO__" +speed = 1 +language = "Auto" +seed = 0 +max_new_tokens = 2048 +top_p = 0.9 +top_k = 20 +temperature = 0.9 +repetition_penalty = 1.05 diff --git a/config_keys.py b/config_keys.py new file mode 100644 index 00000000..b7993eca --- /dev/null +++ b/config_keys.py @@ -0,0 +1,103 @@ +""" +配置键常量定义 +集中管理所有配置键,避免硬编码 +""" + + +class ConfigKeys: + """配置键常量类""" + + # ========== Plugin 配置 ========== + PLUGIN_ENABLED = "plugin.enabled" + PLUGIN_CONFIG_VERSION = "plugin.config_version" + + # ========== General 通用配置 ========== + GENERAL_DEFAULT_BACKEND = "general.default_backend" + GENERAL_TIMEOUT = "general.timeout" + GENERAL_MAX_TEXT_LENGTH = "general.max_text_length" + GENERAL_USE_REPLYER_REWRITE = "general.use_replyer_rewrite" + GENERAL_AUDIO_OUTPUT_DIR = "general.audio_output_dir" + GENERAL_USE_BASE64_AUDIO = "general.use_base64_audio" + GENERAL_SPLIT_SENTENCES = "general.split_sentences" + GENERAL_SPLIT_DELAY = "general.split_delay" + GENERAL_SPLIT_MIN_TOTAL_CHARS = "general.split_min_total_chars" + GENERAL_SPLIT_MIN_SENTENCE_CHARS = "general.split_min_sentence_chars" + GENERAL_SPLIT_MAX_SEGMENTS = "general.split_max_segments" + GENERAL_SPLIT_CHUNK_CHARS = "general.split_chunk_chars" + GENERAL_SEND_ERROR_MESSAGES = "general.send_error_messages" + + # ========== Components 组件配置 ========== + COMPONENTS_ACTION_ENABLED = "components.action_enabled" + COMPONENTS_COMMAND_ENABLED = "components.command_enabled" + COMPONENTS_INSTRUCT_COMMAND_ENABLED = "components.instruct_command_enabled" + + # ========== Probability 概率控制配置 ========== + PROBABILITY_ENABLED = "probability.enabled" + PROBABILITY_BASE_PROBABILITY = "probability.base_probability" + PROBABILITY_KEYWORD_FORCE_TRIGGER = "probability.keyword_force_trigger" + PROBABILITY_FORCE_KEYWORDS = "probability.force_keywords" + + # ========== AI Voice 配置 ========== + AI_VOICE_DEFAULT_CHARACTER = "ai_voice.default_character" + AI_VOICE_ALIAS_MAP = "ai_voice.alias_map" + + # ========== GSV2P 配置 ========== + GSV2P_API_URL = "gsv2p.api_url" + GSV2P_API_TOKEN = "gsv2p.api_token" + GSV2P_DEFAULT_VOICE = "gsv2p.default_voice" + GSV2P_TIMEOUT = "gsv2p.timeout" + GSV2P_MODEL = "gsv2p.model" + GSV2P_RESPONSE_FORMAT = "gsv2p.response_format" + GSV2P_SPEED = "gsv2p.speed" + + # ========== GPT-SoVITS 配置 ========== + GPT_SOVITS_SERVER = "gpt_sovits.server" + GPT_SOVITS_STYLES = "gpt_sovits.styles" + + # ========== Doubao 豆包配置 ========== + DOUBAO_API_URL = "doubao.api_url" + DOUBAO_APP_ID = "doubao.app_id" + DOUBAO_ACCESS_KEY = "doubao.access_key" + DOUBAO_RESOURCE_ID = "doubao.resource_id" + DOUBAO_DEFAULT_VOICE = "doubao.default_voice" + DOUBAO_TIMEOUT = "doubao.timeout" + DOUBAO_AUDIO_FORMAT = "doubao.audio_format" + DOUBAO_SAMPLE_RATE = "doubao.sample_rate" + DOUBAO_BITRATE = "doubao.bitrate" + DOUBAO_SPEED = "doubao.speed" + DOUBAO_VOLUME = "doubao.volume" + DOUBAO_CONTEXT_TEXTS = "doubao.context_texts" + + # ========== CosyVoice 配置 ========== + COSYVOICE_GRADIO_URL = "cosyvoice.gradio_url" + COSYVOICE_DEFAULT_MODE = "cosyvoice.default_mode" + COSYVOICE_DEFAULT_INSTRUCT = "cosyvoice.default_instruct" + COSYVOICE_REFERENCE_AUDIO = "cosyvoice.reference_audio" + COSYVOICE_PROMPT_TEXT = "cosyvoice.prompt_text" + COSYVOICE_TIMEOUT = "cosyvoice.timeout" + 
COSYVOICE_AUDIO_FORMAT = "cosyvoice.audio_format" + + # ========== ComfyUI (Workflow API) 配置 ========== + COMFYUI_SERVER = "comfyui.server" + COMFYUI_INPUT_DIR = "comfyui.input_dir" + COMFYUI_TIMEOUT = "comfyui.timeout" + COMFYUI_DEFAULT_STYLE = "comfyui.default_style" + COMFYUI_STYLES = "comfyui.styles" + # Convenience aliases to split voiceclone/customvoice at the plugin level. + # Both backends still use comfyui.styles, but these keys let you pick different defaults. + COMFYUI_VOICECLONE_DEFAULT_STYLE = "comfyui.voiceclone_default_style" + COMFYUI_CUSTOMVOICE_DEFAULT_STYLE = "comfyui.customvoice_default_style" + COMFYUI_AUDIO_QUALITY = "comfyui.audio_quality" + COMFYUI_MLX_PYTHON = "comfyui.mlx_python" + COMFYUI_MLX_CLI = "comfyui.mlx_cli" + COMFYUI_PAUSE_LINEBREAK = "comfyui.pause_linebreak" + COMFYUI_PERIOD_PAUSE = "comfyui.period_pause" + COMFYUI_COMMA_PAUSE = "comfyui.comma_pause" + COMFYUI_QUESTION_PAUSE = "comfyui.question_pause" + COMFYUI_HYPHEN_PAUSE = "comfyui.hyphen_pause" + + # Auto instruct (CustomVoice) + COMFYUI_AUTO_INSTRUCT_ENABLED = "comfyui.auto_instruct_enabled" + COMFYUI_AUTO_INSTRUCT_BASE_TONE = "comfyui.auto_instruct_base_tone" + COMFYUI_AUTO_INSTRUCT_PROMPT = "comfyui.auto_instruct_prompt" + COMFYUI_AUTO_INSTRUCT_MAX_CHARS = "comfyui.auto_instruct_max_chars" diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..8ee2b155 --- /dev/null +++ b/plugin.py @@ -0,0 +1,972 @@ +""" +统一TTS语音合成插件 +支持五种后端:AI Voice (MaiCore内置) / GSV2P (云API) / GPT-SoVITS (本地服务) / 豆包语音 (云API) / CosyVoice (ModelScope Gradio) + +Version: 3.2.3 +Author: 靓仔 +""" + +import sys +sys.dont_write_bytecode = True + +import asyncio +import random +from typing import List, Tuple, Type, Optional + +from src.common.logger import get_logger +from src.plugin_system.base.base_plugin import BasePlugin +from src.plugin_system.apis.plugin_register_api import register_plugin +from src.plugin_system.base.base_action import BaseAction, ActionActivationType +from src.plugin_system.base.base_command import BaseCommand +from src.plugin_system.base.component_types import ComponentInfo, ChatMode +from src.plugin_system.base.config_types import ConfigField +from src.plugin_system.apis import generator_api + +# 导入模块化的后端和工具 +from .backends import TTSBackendRegistry, TTSResult +from .backends.ai_voice import AI_VOICE_ALIAS_MAP +from .backends.doubao import DOUBAO_EMOTION_MAP +from .utils.text import TTSTextUtils +from .config_keys import ConfigKeys + +logger = get_logger("tts_voice_plugin") + +# 有效后端列表 +VALID_BACKENDS = [ + "ai_voice", + "gsv2p", + "gpt_sovits", + "doubao", + "cosyvoice", + "comfyui", + "comfyui_voiceclone", + "comfyui_customvoice", +] + + +class TTSExecutorMixin: + """ + TTS执行器混入类 + + 提供 Action 和 Command 共享的后端执行逻辑 + """ + + def _create_backend(self, backend_name: str): + """ + 创建后端实例 + + Args: + backend_name: 后端名称 + + Returns: + 后端实例 + """ + backend = TTSBackendRegistry.create( + backend_name, + self.get_config, + self.log_prefix + ) + + if backend: + # 注入必要的回调函数 + if hasattr(backend, 'set_send_custom'): + backend.set_send_custom(self.send_custom) + if hasattr(backend, 'set_send_command'): + backend.set_send_command(self.send_command) + + return backend + + async def _execute_backend( + self, + backend_name: str, + text: str, + voice: str = "", + emotion: str = "" + ) -> TTSResult: + """ + 执行指定后端 + + Args: + backend_name: 后端名称 + text: 待转换文本 + voice: 音色 + emotion: 情感(豆包后端) + + Returns: + TTSResult + """ + backend = self._create_backend(backend_name) + + if not backend: + return TTSResult( + 
success=False, + message=f"未知的TTS后端: {backend_name}" + ) + + # AI Voice 私聊限制检查 + if backend_name == "ai_voice": + is_private = self._check_is_private_chat() + if is_private: + logger.info(f"{self.log_prefix} AI语音仅支持群聊,自动切换到GSV2P后端") + return await self._execute_backend("gsv2p", text, voice, emotion) + + # Pass chat context through for backends that need MaiBot LLM APIs (e.g., comfyui auto_instruct). + chat_stream = None + if hasattr(self, "chat_stream"): + chat_stream = getattr(self, "chat_stream", None) + elif hasattr(self, "message"): + chat_stream = getattr(getattr(self, "message", None), "chat_stream", None) + + return await backend.execute(text, voice, emotion=emotion, chat_stream=chat_stream) + + def _check_is_private_chat(self) -> bool: + """检查是否是私聊""" + # Action 中使用 chat_stream + if hasattr(self, 'chat_stream'): + return not getattr(self.chat_stream, 'group_info', None) + # Command 中使用 message + if hasattr(self, 'message'): + msg_info = getattr(self.message, 'message_info', None) + if msg_info: + return not getattr(msg_info, 'group_info', None) + return False + + def _get_default_backend(self) -> str: + """获取配置的默认后端""" + backend = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p") + if backend not in VALID_BACKENDS: + logger.warning(f"{self.log_prefix} 配置的默认后端 '{backend}' 无效,使用 gsv2p") + return "gsv2p" + return backend + + async def _send_error(self, message: str) -> None: + """ + 发送错误提示信息(受全局配置控制) + + Args: + message: 错误消息 + """ + if self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True): + await self.send_text(message) + + +class UnifiedTTSAction(BaseAction, TTSExecutorMixin): + """统一TTS Action - LLM自动触发""" + + action_name = "unified_tts_action" + action_description = "用语音回复(支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)" + activation_type = ActionActivationType.KEYWORD + mode_enable = ChatMode.ALL + parallel_action = False + + activation_keywords = [ + "语音", "说话", "朗读", "念一下", "读出来", + "voice", "speak", "tts", "语音回复", "用语音说", "播报" + ] + keyword_case_sensitive = False + + action_parameters = { + "text": "要转换为语音的文本内容(必填)", + "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice,可选,建议省略让系统自动使用配置的默认后端)", + "voice": "音色/风格参数(可选)", + "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等" + } + + action_require = [ + "当用户要求用语音回复时使用", + "当回复简短问候语时使用(如早上好、晚安、你好等)", + "当想让回复更活泼生动时可以使用", + "注意:回复内容过长或者过短不适合用语音", + "注意:backend参数建议省略,系统会自动使用配置的默认后端" + ] + + associated_types = ["text", "command"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60) + self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500) + + def _check_force_trigger(self, text: str) -> bool: + """检查是否强制触发""" + if not self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True): + return False + force_keywords = self.get_config( + ConfigKeys.PROBABILITY_FORCE_KEYWORDS, + ["一定要用语音", "必须语音", "语音回复我", "务必用语音"] + ) + return any(kw in text for kw in force_keywords) + + def _probability_check(self, text: str) -> bool: + """概率控制检查""" + if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True): + return True + + base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0) + base_prob = max(0.0, min(1.0, base_prob)) + result = random.random() < base_prob + logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}") + return result + + async def _get_final_text(self, raw_text: str, 
reason: str, use_replyer: bool) -> Tuple[bool, str]: + """获取最终要转语音的文本(使用与正常回复一致的prompt参数)""" + max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200) + + if not use_replyer: + if not raw_text: + return False, "" + return True, raw_text + + try: + # 统一使用 generate_reply 以确保触发 POST_LLM 事件(日程注入) + # rewrite_reply 不会触发 POST_LLM 事件,因此不适用 + # 注意:长度约束放在末尾,利用 LLM 的"近因效应"提高遵守率 + extra_info_parts = [] + if raw_text: + extra_info_parts.append(f"期望的回复内容:{raw_text}") + # 长度约束放在最后,使用更强的表述 + extra_info_parts.append( + f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。" + f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。" + ) + + success, llm_response = await generator_api.generate_reply( + chat_stream=self.chat_stream, + reply_message=self.action_message, + reply_reason=reason, + extra_info="\n".join(extra_info_parts), + request_type="tts_voice_plugin", + from_plugin=False # 允许触发POST_LLM事件,使日程注入生效 + ) + if success and llm_response and llm_response.content: + logger.info(f"{self.log_prefix} 语音内容生成成功") + return True, llm_response.content.strip() + + # 如果生成失败但有原始文本,则使用原始文本 + if raw_text: + logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本") + return True, raw_text + + return False, "" + except Exception as e: + logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}") + return bool(raw_text), raw_text + + async def execute(self) -> Tuple[bool, str]: + def _chunk_sentences( + parts: List[str], target_chars: int, max_chunks: int + ) -> List[str]: + # Greedy packing: reduces tiny fragments into fewer, longer segments. + if not parts: + return [] + if target_chars <= 0: + target_chars = 120 + + def pack(tgt: int) -> List[str]: + out: List[str] = [] + cur = "" + for s in parts: + s = (s or "").strip() + if not s: + continue + if not cur: + cur = s + continue + if len(cur) + len(s) <= tgt: + cur += s + else: + out.append(cur) + cur = s + if cur: + out.append(cur) + return out + + packed = pack(target_chars) + if max_chunks and max_chunks > 0 and len(packed) > max_chunks: + total = len("".join(parts)) + new_target = max(target_chars, int(total / max_chunks) + 1) + packed = pack(new_target) + return packed + + async def send_message_single_sentences() -> Tuple[bool, str]: + result = await self._execute_backend(backend, clean_text, voice, emotion) + if result.success: + # 生成更详细的动作记录,帮助 planner 避免重复执行 + text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用语音回复:{text_preview}", + action_done=True + ) + else: + await self._send_error(f"语音合成失败: {result.message}") + + return result.success, result.message + async def send_message_with_splited_sentences() -> Tuple[bool, str]: + # 分段发送模式:将文本分割成句子,逐句发送语音 + if len(sentences) > 1: + logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)} 句") + + success_count = 0 + all_sentences_text = [] + + for i, sentence in enumerate(sentences): + if not sentence.strip(): + continue + + logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...") + result = await self._execute_backend(backend, sentence, voice, emotion) + + if result.success: + success_count += 1 + all_sentences_text.append(sentence) + else: + logger.warning(f"{self.log_prefix} 第 {i + 1} 句发送失败: {result.message}") + + # 句子之间添加延迟 + if i < len(sentences) - 1 and split_delay > 0: + await asyncio.sleep(split_delay) + + # 记录动作信息 + if success_count > 0: + # 生成更详细的动作记录,帮助 planner 避免重复执行 + display_text = "".join(all_sentences_text) + text_preview = display_text[:80] + "..." 
if len(display_text) > 80 else display_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用语音回复({success_count}段):{text_preview}", + action_done=True + ) + return True, f"成功发送 {success_count}/{len(sentences)} 条语音" + else: + await self._send_error("语音合成失败") + return False, "所有语音发送失败" + else: + # 只有一句,正常发送 + return await send_message_single_sentences() + + """执行TTS语音合成""" + try: + raw_text = self.action_data.get("text", "").strip() + voice = self.action_data.get("voice", "") + reason = self.action_data.get("reason", "") + emotion = self.action_data.get("emotion", "") + + use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True) + + # 获取最终文本 + success, final_text = await self._get_final_text(raw_text, reason, use_replyer) + if not success or not final_text: + await self._send_error("无法生成语音内容") + return False, "文本为空" + + # 概率检查 + force_trigger = self._check_force_trigger(final_text) + if not force_trigger and not self._probability_check(final_text): + logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复") + await self.send_text(final_text) + text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}", + action_done=True + ) + return True, "概率检查未通过,已发送文字回复" + + # 清理文本(移除特殊字符,替换网络用语) + # 注意:长度应该由LLM在生成时就遵守,这里只做字符清理 + clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length) + if not clean_text: + await self._send_error("文本处理后为空") + return False, "文本处理后为空" + + # 如果清理后的文本仍然超过限制,说明LLM未遵守约束 + if len(clean_text) > self.max_text_length: + logger.warning( + f"{self.log_prefix} LLM生成的文本超过长度限制 " + f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复" + ) + await self.send_text(clean_text) + text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用文字回复(内容过长):{text_preview}", + action_done=True + ) + return True, "内容超过语音长度限制,已改为文字回复" + + # 获取后端并执行 + backend = self._get_default_backend() + logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}") + + # 检查是否启用分段发送 + split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True) + split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3) + + sentences = None + + # 优先使用智能分割插件的分隔符 + if '|||SPLIT|||' in clean_text: + logger.info("found split marker from smart segmentation plugin") + sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()] + # If the upstream splitter is too aggressive, pack back into fewer segments. 
+                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
+                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
+                if max_segments and max_segments > 0 and len(sentences) > max_segments:
+                    sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
+                return await send_message_with_splited_sentences()
+            elif split_sentences:
+                # 自动分段:短文本不分段;长文本最多分成 N 段,避免刷屏式多段语音。
+                min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120)
+                min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6)
+                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
+                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
+
+                if len(clean_text) < min_total:
+                    sentences = [clean_text]
+                else:
+                    sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence)
+                    if max_segments and max_segments > 0:
+                        sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
+                return await send_message_with_splited_sentences()
+            else:
+                # 单句发送
+                return await send_message_single_sentences()
+
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}")
+            await self._send_error(f"语音合成出错: {error_msg}")
+            return False, error_msg
+
+
+class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin):
+    """统一TTS Command - 用户手动触发"""
+
+    command_name = "unified_tts_command"
+    command_description = "将文本转换为语音,支持多种后端和音色"
+    # 命名捕获组 text / voice / backend 与 execute() 中 matched_groups 的键一一对应
+    command_pattern = r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?(?:\s+(?P<backend>ai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$"
+    command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]"
+    command_examples = [
+        "/tts 你好,世界!",
+        "/tts 今天天气不错 -v 小新",
+        "/gptsovits 你好世界 -v default",
+        "/cosyvoice 你好世界 -v 四川话",
+        "/tts 试试 -v 温柔妹妹 ai_voice",
+        "/gsv2p 你好世界",
+        "/doubao 你好世界 -v 开心"
+    ]
+    intercept_message = True
+
+    async def _send_help(self):
+        """发送帮助信息"""
+        default_backend = self._get_default_backend()
+
+        help_text = """【TTS语音合成插件帮助】
+
+📝 基本语法:
+/tts <文本> [-v <音色>] [后端]
+
+🎯 快捷命令:
+/tts <文本>              使用默认后端
+/voice <文本>            使用 AI Voice
+/gsv2p <文本>            使用 GSV2P
+/gptsovits <文本>        使用 GPT-SoVITS
+/doubao <文本>           使用 豆包语音
+/cosyvoice <文本>        使用 CosyVoice
+/comfyui <文本>          使用 ComfyUI(本地工作流)
+/comfyui_voiceclone <文本>   使用 ComfyUI VoiceClone
+/comfyui_customvoice <文本>  使用 ComfyUI CustomVoice
+
+🔊 可用后端:
+• ai_voice - MaiCore内置(仅群聊)
+• gsv2p - 云端API,高质量
+• gpt_sovits - 本地服务,可定制
+• doubao - 火山引擎,支持情感
+• cosyvoice - 阿里云,支持方言
+• comfyui - 本地ComfyUI工作流(自动按 style.mode 选择)
+• comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone)
+• comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice)
+
+🎭 音色/情感参数(-v):
+• AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种
+• GSV2P: 原神-中文-派蒙_ZH 等(见API文档)
+• 豆包: 开心、生气、伤心、撒娇、严肃 等
+• CosyVoice: 广东话、四川话、东北话、开心、慢速 等
+
+📌 示例:
+/tts 你好世界
+/tts 今天真开心 -v 开心
+/gptsovits 这是本地语音合成
+/doubao 我生气了 -v 生气
+/cosyvoice 你好 -v 广东话
+/voice 测试一下 -v 温柔妹妹
+
+⚙️ 当前默认后端:""" + default_backend
+
+        await self.send_text(help_text)
+
+    def _determine_backend(self, user_backend: str) -> Tuple[str, str]:
+        """
+        确定使用的后端
+
+        Returns:
+            (backend_name, source_description)
+        """
+        # 1. 检查命令前缀
+        raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text
+        if raw_text:
+            # 命令前缀到后端的映射
+            prefix_backend_map = {
+                "/gsv2p": "gsv2p",
+                "/gptsovits": "gpt_sovits",
+                "/doubao": "doubao",
+                "/cosyvoice": "cosyvoice",
+                "/voice": "ai_voice",
+                "/comfyui": "comfyui",
+                "/comfyui_voiceclone": "comfyui_voiceclone",
+                "/comfyui_customvoice": "comfyui_customvoice",
+            }
+            for prefix, backend in prefix_backend_map.items():
+                if raw_text.startswith(prefix):
+                    return backend, f"命令前缀 {prefix}"
+
+        # 2. 检查命令参数
+        if user_backend and user_backend in VALID_BACKENDS:
+            return user_backend, f"命令参数 {user_backend}"
+
+        # 3. 使用配置文件默认值
+        return self._get_default_backend(), "配置文件"
+
+    async def execute(self) -> Tuple[bool, str, bool]:
+        """执行TTS命令"""
+        try:
+            text = self.matched_groups.get("text", "").strip()
+            voice = self.matched_groups.get("voice", "")
+            user_backend = self.matched_groups.get("backend", "")
+
+            # 处理帮助命令
+            if text.lower() == "help":
+                await self._send_help()
+                return True, "显示帮助信息", True
+
+            if not text:
+                await self._send_error("请输入要转换为语音的文本内容")
+                return False, "缺少文本内容", True
+
+            # 确定后端
+            backend, backend_source = self._determine_backend(user_backend)
+
+            # 清理文本
+            max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)
+            clean_text = TTSTextUtils.clean_text(text, max_length)
+
+            if not clean_text:
+                await self._send_error("文本处理后为空")
+                return False, "文本处理后为空", True
+
+            # 检查长度限制
+            if len(clean_text) > max_length:
+                await self.send_text(
+                    f"文本过长({len(clean_text)}字符),"
+                    f"超过语音合成限制({max_length}字符),"
+                    f"已改为文字发送。\n\n{clean_text}"
+                )
+                return True, "文本过长,已改为文字发送", True
+
+            logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})")
+
+            # 执行后端
+            # 对于 CosyVoice 和豆包,voice 参数实际上是情感/方言
+            if backend in ["cosyvoice", "doubao"]:
+                result = await self._execute_backend(backend, clean_text, voice="", emotion=voice)
+            else:
+                result = await self._execute_backend(backend, clean_text, voice)
+
+            if not result.success:
+                await self._send_error(f"语音合成失败: {result.message}")
+
+            return result.success, result.message, True
+
+        except Exception as e:
+            logger.error(f"{self.log_prefix} TTS命令执行出错: {e}")
+            await self._send_error(f"语音合成出错: {e}")
+            return False, f"执行出错: {e}", True
+
+
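# ---------------------------------------------------------------------------
# Editorial sketch (not part of the commit): a self-contained check of the
# named capture groups used in command_pattern above. The pattern here is a
# trimmed copy (the backend alternation is shortened to two prefixes for
# brevity) and _demo_command_pattern is a hypothetical helper, not plugin API.
# ---------------------------------------------------------------------------
def _demo_command_pattern() -> None:
    import re

    # Same group names the command relies on via matched_groups.
    pattern = re.compile(r"^/(?:tts|voice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?$")
    m = pattern.match("/tts 今天真开心 -v 开心")
    assert m is not None
    assert m.group("text") == "今天真开心"   # surfaces as matched_groups["text"]
    assert m.group("voice") == "开心"        # surfaces as matched_groups["voice"]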
+class TTSInstructCommand(BaseCommand):
+    """生成 CustomVoice instruct(调试/预览用)"""
+
+    command_name = "tts_instruct_command"
+    command_description = "根据待朗读文本生成 CustomVoice 的 instruct(情绪/语速/停顿)"
+    command_pattern = r"^/tts_instruct\s+(?P<text>.+?)$"
+    command_help = "用法:/tts_instruct <文本>"
+    command_examples = [
+        "/tts_instruct 早上好,今天也要加油。",
+        "/tts_instruct えっ?本当にそうなの?",
+    ]
+    intercept_message = True
+
+    async def execute(self) -> Tuple[bool, str, int]:
+        try:
+            text = (self.matched_groups.get("text") or "").strip()
+            if not text:
+                await self.send_text("请输入要生成 instruct 的文本")
+                return False, "缺少文本", 2
+
+            # Use the same logic as ComfyUI backend auto_instruct.
+ from .backends.comfyui import ComfyUIBackend + from .utils.text import TTSTextUtils + + detected = TTSTextUtils.detect_language(text) + chat_stream = getattr(self.message, "chat_stream", None) + chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None + + backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix) + instruct = await backend._infer_instruct( + text=text, + detected_lang=detected, + chat_stream=chat_stream, + chat_id=chat_id, + style_name="__command__", + ) + + if not instruct: + await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)") + return False, "instruct 生成失败", 2 + + await self.send_text(instruct) + return True, "instruct 已生成", 2 + except Exception as e: + await self.send_text(f"instruct 生成异常: {e}") + return False, str(e), 2 + + +@register_plugin +class UnifiedTTSPlugin(BasePlugin): + """统一TTS语音合成插件 - 支持多后端的文本转语音插件""" + + plugin_name = "tts_voice_plugin" + plugin_description = "统一TTS语音合成插件,支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端" + plugin_version = "3.2.3" + plugin_author = "靓仔" + enable_plugin = True + config_file_name = "config.toml" + dependencies = [] + python_dependencies = ["aiohttp"] + + config_section_descriptions = { + "plugin": "插件基本配置", + "general": "通用设置", + "components": "组件启用控制", + "probability": "概率控制配置", + "ai_voice": "AI Voice后端配置", + "gsv2p": "GSV2P后端配置", + "gpt_sovits": "GPT-SoVITS后端配置", + "doubao": "豆包语音后端配置", + "cosyvoice": "CosyVoice后端配置", + "comfyui": "ComfyUI工作流API后端配置" + } + + config_schema = { + "plugin": { + "enabled": ConfigField(type=bool, default=True, description="是否启用插件"), + "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本") + }, + "general": { + "default_backend": ConfigField( + type=str, default="cosyvoice", + description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)" + ), + "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"), + "max_text_length": ConfigField( + type=int, default=200, + description="最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)" + ), + "use_replyer_rewrite": ConfigField( + type=bool, default=True, + description="是否使用replyer润色语音内容" + ), + "audio_output_dir": ConfigField( + type=str, default="", + description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)" + ), + "use_base64_audio": ConfigField( + type=bool, default=True, + description="是否使用base64编码发送音频(备选方案)" + ), + "split_sentences": ConfigField( + type=bool, default=True, + description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)" + ), + "split_delay": ConfigField( + type=float, default=0.3, + description="分段发送时每条语音之间的延迟(秒)" + ), + "split_min_total_chars": ConfigField( + type=int, default=120, + description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)", + ), + "split_min_sentence_chars": ConfigField( + type=int, default=6, + description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)", + ), + "split_max_segments": ConfigField( + type=int, default=3, + description="自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。", + ), + "split_chunk_chars": ConfigField( + type=int, default=110, + description="自动分段打包目标长度(字符)。用于把多句合并成更少段。", + ), + "send_error_messages": ConfigField( + type=bool, default=True, + description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)" + ) + }, + "components": { + "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"), + "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"), + "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)") + }, + "probability": { + 
"enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"), + "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"), + "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"), + "force_keywords": ConfigField( + type=list, + default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"], + description="强制触发关键词" + ) + }, + "ai_voice": { + "default_character": ConfigField( + type=str, + default="邻家小妹", + description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)" + ) + }, + "gsv2p": { + "api_url": ConfigField( + type=str, default="https://gsv2p.acgnai.top/v1/audio/speech", + description="GSV2P API地址" + ), + "api_token": ConfigField(type=str, default="", description="API认证Token"), + "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"), + "timeout": ConfigField(type=int, default=120, description="API请求超时(秒)"), + "model": ConfigField(type=str, default="tts-v4", description="TTS模型"), + "response_format": ConfigField(type=str, default="wav", description="音频格式"), + "speed": ConfigField(type=float, default=1.0, description="语音速度") + }, + "gpt_sovits": { + "server": ConfigField( + type=str, default="http://127.0.0.1:9880", + description="GPT-SoVITS服务地址" + ), + "styles": ConfigField( + type=list, + default=[ + { + "name": "default", + "refer_wav": "", + "prompt_text": "", + "prompt_language": "zh", + "gpt_weights": "", + "sovits_weights": "" + } + ], + description="语音风格配置", + item_type="object", + item_fields={ + "name": {"type": "string", "label": "风格名称", "required": True}, + "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, + "prompt_text": {"type": "string", "label": "参考文本", "required": True}, + "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"}, + "gpt_weights": {"type": "string", "label": "GPT模型权重路径(可选)", "required": False}, + "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径(可选)", "required": False} + } + ) + }, + "doubao": { + "api_url": ConfigField( + type=str, + default="https://openspeech.bytedance.com/api/v3/tts/unidirectional", + description="豆包语音API地址" + ), + "app_id": ConfigField(type=str, default="", description="豆包APP ID"), + "access_key": ConfigField(type=str, default="", description="豆包Access Key"), + "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"), + "default_voice": ConfigField( + type=str, default="zh_female_vv_uranus_bigtts", + description="默认音色" + ), + "timeout": ConfigField(type=int, default=60, description="API请求超时(秒)"), + "audio_format": ConfigField(type=str, default="wav", description="音频格式"), + "sample_rate": ConfigField(type=int, default=24000, description="采样率"), + "bitrate": ConfigField(type=int, default=128000, description="比特率"), + "speed": ConfigField(type=float, default=None, description="语音速度(可选)"), + "volume": ConfigField(type=float, default=None, description="音量(可选)"), + "context_texts": ConfigField( + type=list, default=None, + description="上下文辅助文本(可选,仅豆包2.0模型)" + ) + }, + "cosyvoice": { + "gradio_url": ConfigField( + type=str, + default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/", + description="Gradio API地址" + ), + "default_mode": ConfigField( + type=str, + default="3s极速复刻", + description="推理模式(3s极速复刻/自然语言控制)" + ), + "default_instruct": ConfigField( + type=str, + default="You are a helpful assistant. 
请用广东话表达。<|endofprompt|>", + description="默认指令(用于自然语言控制模式)" + ), + "reference_audio": ConfigField( + type=str, + default="", + description="参考音频路径(用于3s极速复刻模式)" + ), + "prompt_text": ConfigField( + type=str, + default="", + description="提示文本(用于3s极速复刻模式)" + ), + "timeout": ConfigField(type=int, default=300, description="API请求超时(秒)"), + "audio_format": ConfigField(type=str, default="wav", description="音频格式") + }, + "comfyui": { + "server": ConfigField( + type=str, + default="http://127.0.0.1:8188", + description="ComfyUI 服务地址(示例: http://127.0.0.1:8188)", + ), + "input_dir": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input", + description="ComfyUI input 目录(用于放参考音频,LoadAudio 会从这里读)", + ), + "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"), + "audio_quality": ConfigField( + type=str, + default="128k", + description="输出 MP3 质量(SaveAudioMP3 quality: V0/128k/320k)", + ), + "mlx_python": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python", + description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)", + ), + "mlx_cli": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py", + description="mlx_voice_clone_cli.py 路径", + ), + "default_style": ConfigField(type=str, default="default", description="默认风格名称"), + "voiceclone_default_style": ConfigField( + type=str, + default="", + description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style)", + ), + "customvoice_default_style": ConfigField( + type=str, + default="", + description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style)", + ), + "auto_instruct_enabled": ConfigField( + type=bool, + default=False, + description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)", + ), + "auto_instruct_max_chars": ConfigField( + type=int, + default=120, + description="自动推断 instruct 的最大长度(字符)。建议 80-160,太短会导致情绪/表演提示被截断。", + ), + "auto_instruct_prompt": ConfigField( + type=str, + default="", + description="自定义 instruct 推断 prompt(留空使用内置模板)", + ), + "auto_instruct_base_tone": ConfigField( + type=str, + default="", + description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`)", + ), + "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"), + "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"), + "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"), + "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"), + "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"), + "styles": ConfigField( + type=list, + default=[ + { + "name": "default", + "refer_wav": "", + "prompt_text": "", + "language": "", + "model_choice": "1.7B", + "precision": "bf16", + "seed": 0, + "max_new_tokens": 2048, + "top_p": 0.8, + "top_k": 20, + "temperature": 1.0, + "repetition_penalty": 1.05, + } + ], + description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)", + item_type="object", + item_fields={ + "name": {"type": "string", "label": "风格名称", "required": True}, + "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False}, + "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, + "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True}, + "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) 
", "required": False}, + "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False}, + "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False}, + "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False}, + "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False}, + "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False}, + "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False}, + "speed": {"type": "number", "label": "speed", "required": False}, + "seed": {"type": "number", "label": "seed", "required": False}, + "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False}, + "top_p": {"type": "number", "label": "top_p", "required": False}, + "top_k": {"type": "number", "label": "top_k", "required": False}, + "temperature": {"type": "number", "label": "temperature", "required": False}, + "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False}, + }, + ), + } + } + + def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]: + """返回插件组件列表""" + components = [] + + try: + action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True) + command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True) + instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True) + except AttributeError: + action_enabled = True + command_enabled = True + instruct_enabled = True + + if action_enabled: + components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction)) + + if command_enabled: + components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand)) + + if instruct_enabled: + components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand)) + + return components diff --git a/test.wav b/test.wav new file mode 100644 index 00000000..37550701 Binary files /dev/null and b/test.wav differ diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..1c0e5cd4 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,12 @@ +""" +TTS工具模块 +""" + +import sys +sys.dont_write_bytecode = True + +from .text import TTSTextUtils +from .session import TTSSessionManager +from .file import TTSFileManager + +__all__ = ["TTSTextUtils", "TTSSessionManager", "TTSFileManager"] diff --git a/utils/file.py b/utils/file.py new file mode 100644 index 00000000..c56469a6 --- /dev/null +++ b/utils/file.py @@ -0,0 +1,280 @@ +""" +文件操作工具类 +提供异步文件操作、临时文件管理等功能 +""" + +import os +import uuid +import tempfile +import asyncio +import base64 +from typing import Optional +from src.common.logger import get_logger + +logger = get_logger("tts_file_manager") + +# 音频数据最小有效大小(字节) +MIN_AUDIO_SIZE = 100 + + +class TTSFileManager: + """ + TTS文件管理器 + + 提供: + - 临时文件创建(避免并发冲突) + - 异步文件写入 + - 自动清理 + - 相对路径和绝对路径支持 + """ + + # 临时文件目录(兼容旧代码) + _temp_dir: Optional[str] = None + + # 项目根目录(用于解析相对路径) + _project_root: Optional[str] = None + + @classmethod + def set_project_root(cls, root_path: str): + """设置项目根目录""" + if os.path.isdir(root_path): + cls._project_root = root_path + logger.debug(f"设置项目根目录: {root_path}") + else: + logger.warning(f"项目根目录不存在: {root_path}") + + @classmethod + def get_project_root(cls) -> str: + """获取项目根目录""" + if cls._project_root is None: + # 尝试从当前文件位置推断项目根目录 + current_file = os.path.abspath(__file__) + # 假设结构是: project_root/plugins/tts_voice_plugin/utils/file.py + cls._project_root = 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file)))) + logger.debug(f"自动推断项目根目录: {cls._project_root}") + return cls._project_root + + @classmethod + def resolve_path(cls, path: str) -> str: + """ + 解析路径(支持相对路径和绝对路径) + + Args: + path: 路径字符串 + + Returns: + 解析后的绝对路径 + """ + if os.path.isabs(path): + # 已经是绝对路径 + return path + else: + # 相对路径,相对于项目根目录 + return os.path.join(cls.get_project_root(), path) + + @classmethod + def ensure_dir(cls, dir_path: str) -> bool: + """ + 确保目录存在,不存在则创建 + + Args: + dir_path: 目录路径 + + Returns: + 是否成功 + """ + try: + os.makedirs(dir_path, exist_ok=True) + return True + except Exception as e: + logger.error(f"创建目录失败: {dir_path}, 错误: {e}") + return False + + @classmethod + def get_temp_dir(cls) -> str: + """ + 获取临时文件目录(已废弃,保留兼容性) + + Returns: + 临时目录路径 + """ + if cls._temp_dir is None: + cls._temp_dir = tempfile.gettempdir() + return cls._temp_dir + + @classmethod + def set_temp_dir(cls, path: str): + """ + 设置临时文件目录(已废弃,保留兼容性) + + Args: + path: 目录路径 + """ + if os.path.isdir(path): + cls._temp_dir = path + else: + raise ValueError(f"目录不存在: {path}") + + @classmethod + def generate_temp_path(cls, prefix: str = "tts", suffix: str = ".mp3", output_dir: str = "") -> str: + """ + 生成唯一的临时文件路径 + + Args: + prefix: 文件名前缀 + suffix: 文件扩展名 + output_dir: 输出目录(支持相对路径和绝对路径,留空使用项目根目录) + + Returns: + 临时文件的绝对路径 + """ + # 确定输出目录 + if not output_dir: + # 默认使用项目根目录 + resolved_dir = cls.get_project_root() + else: + # 解析用户配置的路径 + resolved_dir = cls.resolve_path(output_dir) + # 确保目录存在 + if not cls.ensure_dir(resolved_dir): + # 如果创建失败,降级到项目根目录 + logger.warning(f"无法创建输出目录 {resolved_dir},使用项目根目录") + resolved_dir = cls.get_project_root() + + # 生成唯一文件名 + unique_id = uuid.uuid4().hex[:12] + filename = f"{prefix}_{unique_id}{suffix}" + return os.path.join(resolved_dir, filename) + + @classmethod + async def write_audio_async(cls, path: str, data: bytes) -> bool: + """ + 异步写入音频数据到文件 + + Args: + path: 文件路径 + data: 音频二进制数据 + + Returns: + 是否写入成功 + """ + try: + # 使用线程池执行同步文件写入,避免阻塞事件循环 + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, cls._write_file_sync, path, data) + logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)") + return True + except IOError as e: + logger.error(f"写入音频文件失败: {path}, 错误: {e}") + return False + except Exception as e: + logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}") + return False + + @staticmethod + def _write_file_sync(path: str, data: bytes): + """同步写入文件(内部方法)""" + with open(path, "wb") as f: + f.write(data) + + @classmethod + def write_audio_sync(cls, path: str, data: bytes) -> bool: + """ + 同步写入音频数据到文件 + + Args: + path: 文件路径 + data: 音频二进制数据 + + Returns: + 是否写入成功 + """ + try: + cls._write_file_sync(path, data) + logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)") + return True + except IOError as e: + logger.error(f"写入音频文件失败: {path}, 错误: {e}") + return False + except Exception as e: + logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}") + return False + + @classmethod + def cleanup_file(cls, path: str, silent: bool = True) -> bool: + """ + 清理临时文件 + + Args: + path: 文件路径 + silent: 是否静默处理错误 + + Returns: + 是否清理成功 + """ + try: + if path and os.path.exists(path): + os.remove(path) + logger.debug(f"临时文件已清理: {path}") + return True + return False + except Exception as e: + if not silent: + logger.warning(f"清理临时文件失败: {path}, 错误: {e}") + return False + + @classmethod + async def cleanup_file_async(cls, path: str, delay: float = 0) -> bool: + """ + 异步清理临时文件(可延迟) + + Args: + path: 文件路径 + delay: 延迟秒数 + + Returns: + 是否清理成功 + """ + if delay > 0: + 
await asyncio.sleep(delay) + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, cls.cleanup_file, path, True) + + @classmethod + def validate_audio_data(cls, data: bytes, min_size: int = None) -> tuple: + """ + 验证音频数据有效性 + + Args: + data: 音频二进制数据 + min_size: 最小有效大小 + + Returns: + (is_valid, error_message) + """ + if data is None: + return False, "音频数据为空" + + min_size = min_size or MIN_AUDIO_SIZE + + if len(data) < min_size: + return False, f"音频数据过小({len(data)}字节 < {min_size}字节)" + + return True, "" + + @classmethod + def audio_to_base64(cls, data: bytes) -> str: + """ + 将音频数据转换为base64字符串 + + Args: + data: 音频二进制数据 + + Returns: + base64编码的字符串 + """ + try: + return base64.b64encode(data).decode('utf-8') + except Exception as e: + logger.error(f"音频数据转base64失败: {e}") + return "" diff --git a/utils/session.py b/utils/session.py new file mode 100644 index 00000000..8535b04c --- /dev/null +++ b/utils/session.py @@ -0,0 +1,186 @@ +""" +HTTP Session 管理器 +提供连接池复用,避免每次请求创建新连接 +""" + +import asyncio +import aiohttp +from typing import Optional, Dict, Any +from contextlib import asynccontextmanager +from src.common.logger import get_logger + +logger = get_logger("tts_session_manager") + + +class TTSSessionManager: + """ + TTS HTTP Session 管理器 + + 提供: + - 连接池复用 + - 自动超时管理 + - 优雅关闭 + """ + + _instance: Optional["TTSSessionManager"] = None + _lock = asyncio.Lock() + + def __init__(self): + self._sessions: Dict[str, aiohttp.ClientSession] = {} + self._default_timeout = 60 + + @classmethod + async def get_instance(cls) -> "TTSSessionManager": + """获取单例实例""" + if cls._instance is None: + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + async def get_session( + self, + backend_name: str = "default", + timeout: int = None + ) -> aiohttp.ClientSession: + """ + 获取或创建 HTTP Session + + Args: + backend_name: 后端名称,用于区分不同的session + timeout: 超时时间(秒) + + Returns: + aiohttp.ClientSession 实例 + """ + if backend_name not in self._sessions or self._sessions[backend_name].closed: + timeout_val = timeout or self._default_timeout + connector = aiohttp.TCPConnector( + limit=10, # 每个主机最大连接数 + limit_per_host=5, + ttl_dns_cache=300, # DNS缓存5分钟 + force_close=True, # 禁用连接复用,修复GSV2P等API的兼容性问题 + ) + self._sessions[backend_name] = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=timeout_val) + ) + logger.debug(f"创建新的HTTP Session: {backend_name}") + + return self._sessions[backend_name] + + async def close_session(self, backend_name: str = None): + """ + 关闭指定或所有 Session + + Args: + backend_name: 后端名称,为None时关闭所有 + """ + if backend_name: + if backend_name in self._sessions: + await self._sessions[backend_name].close() + del self._sessions[backend_name] + logger.debug(f"关闭HTTP Session: {backend_name}") + else: + for name, session in self._sessions.items(): + if not session.closed: + await session.close() + logger.debug(f"关闭HTTP Session: {name}") + self._sessions.clear() + + @asynccontextmanager + async def post( + self, + url: str, + json: Dict[str, Any] = None, + headers: Dict[str, str] = None, + data: Any = None, + backend_name: str = "default", + timeout: int = None + ): + """ + 发送POST请求(异步上下文管理器) + + Args: + url: 请求URL + json: JSON请求体 + headers: 请求头 + data: 表单数据 + backend_name: 后端名称 + timeout: 超时时间 + + Yields: + aiohttp.ClientResponse + + Usage: + async with session_manager.post(url, json=data) as response: + ... 
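            # Fuller, hypothetical call (editorial sketch, not part of the
            # commit); it mirrors how the GPT-SoVITS backend above issues
            # its /tts request:
            async with session_manager.post(
                "http://127.0.0.1:9880/tts",
                json={"text": "你好", "text_lang": "zh"},
                backend_name="gpt_sovits",
                timeout=60,
            ) as response:
                if response.status == 200:
                    audio_data = await response.read()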
+ """ + session = await self.get_session(backend_name, timeout) + + # 如果指定了不同的超时时间,创建新的超时对象 + req_timeout = None + if timeout: + req_timeout = aiohttp.ClientTimeout(total=timeout) + + response = await session.post( + url, + json=json, + headers=headers, + data=data, + timeout=req_timeout + ) + try: + yield response + finally: + response.release() + + @asynccontextmanager + async def get( + self, + url: str, + headers: Dict[str, str] = None, + params: Dict[str, Any] = None, + backend_name: str = "default", + timeout: int = None + ): + """ + 发送GET请求(异步上下文管理器) + + Args: + url: 请求URL + headers: 请求头 + params: URL参数 + backend_name: 后端名称 + timeout: 超时时间 + + Yields: + aiohttp.ClientResponse + + Usage: + async with session_manager.get(url) as response: + ... + """ + session = await self.get_session(backend_name, timeout) + + # 如果指定了不同的超时时间,创建新的超时对象 + req_timeout = None + if timeout: + req_timeout = aiohttp.ClientTimeout(total=timeout) + + response = await session.get( + url, + headers=headers, + params=params, + timeout=req_timeout + ) + try: + yield response + finally: + response.release() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close_session() diff --git a/utils/text.py b/utils/text.py new file mode 100644 index 00000000..93524c08 --- /dev/null +++ b/utils/text.py @@ -0,0 +1,181 @@ +""" +文本处理工具类 +""" + +import re +from typing import Optional, List + + +class TTSTextUtils: + """TTS文本处理工具类""" + + # 网络用语替换映射 + NETWORK_SLANG_MAP = { + 'www': '哈哈哈', + 'hhh': '哈哈', + '233': '哈哈', + '666': '厉害', + '88': '拜拜', + '...': '。', + '……': '。' + } + + # 需要移除的特殊字符正则 + SPECIAL_CHAR_PATTERN = re.compile( + r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,。!?、;:()【】"\'.,!?;:()\[\]`-]' + ) + + # 语言检测正则 + CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]') + ENGLISH_PATTERN = re.compile(r'[a-zA-Z]') + JAPANESE_PATTERN = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]') + + @classmethod + def clean_text(cls, text: str, max_length: int = 500) -> str: + """ + 清理文本,移除特殊字符,替换网络用语 + + Args: + text: 原始文本 + max_length: 最大长度限制(此参数已不用于硬截断,仅用于参考) + + Returns: + 清理后的文本(不会硬截断,保留完整内容以便上层决策) + """ + if not text: + return "" + + # 注释掉文本清理功能,保留原始格式 + # 移除不支持的特殊字符 + # text = cls.SPECIAL_CHAR_PATTERN.sub('', text) + + # 替换常见网络用语 + # for old, new in cls.NETWORK_SLANG_MAP.items(): + # text = text.replace(old, new) + + return text.strip() + + @classmethod + def detect_language(cls, text: str) -> str: + """ + 检测文本语言 + + Args: + text: 待检测文本 + + Returns: + 语言代码 (zh/ja/en) + """ + if not text: + return "zh" + + chinese_chars = len(cls.CHINESE_PATTERN.findall(text)) + english_chars = len(cls.ENGLISH_PATTERN.findall(text)) + japanese_chars = len(cls.JAPANESE_PATTERN.findall(text)) + total_chars = chinese_chars + english_chars + japanese_chars + + if total_chars == 0: + return "zh" + + chinese_ratio = chinese_chars / total_chars + japanese_ratio = japanese_chars / total_chars + english_ratio = english_chars / total_chars + + if chinese_ratio > 0.3: + return "zh" + elif japanese_ratio > 0.3: + return "ja" + elif english_ratio > 0.8: + return "en" + else: + return "zh" + + @classmethod + def resolve_voice_alias( + cls, + voice: Optional[str], + alias_map: dict, + default: str, + prefix: str = "" + ) -> str: + """ + 解析音色别名 + + Args: + voice: 用户指定的音色 + alias_map: 别名映射表 + default: 默认音色 + prefix: 内部音色ID前缀(如 "lucy-voice-") + + Returns: + 解析后的音色ID + """ + if not voice: + voice = default + + # 如果已经是内部ID格式,直接返回 + if prefix and voice.startswith(prefix): + return voice + + # 尝试从别名映射查找 
+ if voice in alias_map: + return alias_map[voice] + + # 尝试使用默认值的别名 + if default in alias_map: + return alias_map[default] + + return default + + @classmethod + def split_sentences(cls, text: str, min_length: int = 2) -> List[str]: + """ + 将文本分割成句子 + + Args: + text: 待分割文本 + min_length: 最小句子长度,过短的句子会合并到前一句 + + Returns: + 句子列表 + """ + if not text: + return [] + + # 使用中英文标点分割 + # 保留分隔符以便后续处理 + pattern = r'([。!?!?;;])' + parts = re.split(pattern, text) + + sentences = [] + current = "" + + for i, part in enumerate(parts): + if not part: + continue + + # 如果是标点符号,附加到当前句子 + if re.match(pattern, part): + current += part + else: + # 如果当前句子不为空,先保存 + if current.strip(): + sentences.append(current.strip()) + current = part + + # 处理最后一段 + if current.strip(): + sentences.append(current.strip()) + + # 合并过短的句子 + if min_length > 0 and len(sentences) > 1: + merged = [] + for sent in sentences: + if merged and len(sent) < min_length: + # 合并到前一句 + merged[-1] += sent + else: + merged.append(sent) + sentences = merged + + return sentences
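# ---------------------------------------------------------------------------
# Editorial sketch (not part of the commit): expected behaviour of the
# helpers above, under the implementation as written. utils/text.py only
# depends on the standard library, so this can be run directly to verify.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Sentences split on CJK/ASCII sentence punctuation; a trailing fragment
    # shorter than min_length ("好。", 2 chars) merges into the previous one.
    sents = TTSTextUtils.split_sentences("今天天气不错。出去走走吧!好。", min_length=3)
    assert sents == ["今天天气不错。", "出去走走吧!好。"]

    # Language detection is ratio-based over CJK/kana/Latin character counts.
    assert TTSTextUtils.detect_language("こんにちは") == "ja"
    assert TTSTextUtils.detect_language("hello world") == "en"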