diff --git a/plugins/tts_voice_plugin/.gitignore b/plugins/tts_voice_plugin/.gitignore
new file mode 100644
index 00000000..f937ce2b
--- /dev/null
+++ b/plugins/tts_voice_plugin/.gitignore
@@ -0,0 +1,40 @@
+# 敏感配置文件
+config.toml
+config.toml.backup.*
+config.toml.reset.*
+
+# Python缓存文件
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# 虚拟环境
+venv/
+ENV/
+env/
+
+# IDE配置
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# 临时文件
+*.log
+*.tmp
+.DS_Store
+
+# 生成的音频文件
+tts_*.mp3
+tts_*.wav
+tts_*.ogg
+
+# 数据目录(包含临时音频文件)
+data/
+
+# 规范工作流目录
+.spec-workflow/
+
+# Claude配置
+.claude/
diff --git a/plugins/tts_voice_plugin/LICENSE b/plugins/tts_voice_plugin/LICENSE
new file mode 100644
index 00000000..0ad25db4
--- /dev/null
+++ b/plugins/tts_voice_plugin/LICENSE
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
diff --git a/plugins/tts_voice_plugin/README.md b/plugins/tts_voice_plugin/README.md
new file mode 100644
index 00000000..9cd5f91c
--- /dev/null
+++ b/plugins/tts_voice_plugin/README.md
@@ -0,0 +1,311 @@
+# TTS 语音合成插件
+
+MaiBot 的文本转语音插件,支持多种 TTS 后端。
+
+## 支持的后端
+
+| 后端 | 说明 | 适用场景 |
+|------|------|----------|
+| AI Voice | MaiCore 内置,无需配置 | 仅群聊 |
+| GSV2P | 云端 API,需要 Token | 群聊/私聊 |
+| GPT-SoVITS | 本地服务,需自行部署 | 群聊/私聊 |
+| 豆包语音 | 火山引擎云服务,高质量 | 群聊/私聊 |
+| CosyVoice | 阿里云 CosyVoice3,支持方言和声音克隆 | 群聊/私聊 |
+| ComfyUI | 本地 ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone) | 群聊/私聊 |
+
+## 安装
+
+```bash
+pip install aiohttp gradio_client
+```
+
+## 配置
+
+编辑 `config.toml`,设置默认后端:
+
+```toml
+[general]
+default_backend = "cosyvoice" # 可选:ai_voice / gsv2p / gpt_sovits / doubao / cosyvoice / comfyui
+audio_output_dir = "" # 音频输出目录,留空使用项目根目录
+use_base64_audio = false # 是否使用base64发送(备选方案)
+split_sentences = true # 是否分段发送语音(长文本逐句发送)
+split_delay = 0.3 # 分段发送间隔时间(秒)
+send_error_messages = true # 是否发送错误提示消息(false=静默失败)
+```
+
+### Docker环境配置说明
+
+**问题:** Docker环境中可能遇到音频上传失败或文件路径识别错误(如`识别URL失败`)
+
+**解决方案(按推荐顺序):**
+
+#### 方案1:使用相对路径(推荐)
+
+```toml
+[general]
+audio_output_dir = "" # 留空,默认使用项目根目录
+```
+
+音频文件将保存在项目根目录,OneBot/NapCat可以正确识别相对路径。
+
+#### 方案2:自定义输出目录
+
+```toml
+[general]
+audio_output_dir = "data/tts_audio" # 相对路径,相对于项目根目录
+# 或
+audio_output_dir = "/app/data/audio" # 绝对路径
+```
+
+#### 方案3:使用base64编码(备选)
+
+如果路径方案都不生效,可启用base64发送:
+
+```toml
+[general]
+use_base64_audio = true # 使用base64编码发送(会增加约33%数据大小)
+```
+
+### 豆包语音配置
+
+```toml
+[doubao]
+app_id = "你的APP_ID"
+access_key = "你的ACCESS_KEY"
+resource_id = "seed-tts-2.0"
+default_voice = "zh_female_vv_uranus_bigtts"
+```
+
+**预置音色:**
+
+| 音色名称 | voice_type |
+|----------|------------|
+| vivi 2.0 | zh_female_vv_uranus_bigtts |
+| 大壹 | zh_male_dayi_saturn_bigtts |
+| 黑猫侦探社咪仔 | zh_female_mizai_saturn_bigtts |
+
+**复刻音色:** 将 `resource_id` 改为 `seed-icl-2.0`,`default_voice` 填音色 ID(如 `S_xxxxxx`)
+
+凭证获取:[火山引擎控制台](https://console.volcengine.com/speech/service/8)
+
+### GSV2P 配置
+
+```toml
+[gsv2p]
+api_token = "你的Token"
+default_voice = "原神-中文-派蒙_ZH"
+```
+
+Token 获取:[https://tts.acgnai.top](https://tts.acgnai.top)
+
+### AI Voice 配置
+
+```toml
+[ai_voice]
+default_character = "温柔妹妹"
+```
+
+可用音色:小新、猴哥、妲己、酥心御姐、温柔妹妹、邻家小妹 等 22 种
+
+### GPT-SoVITS 配置
+
+**支持两种配置格式:**
+
+#### 格式1:数组格式(推荐,WebUI 友好)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[[gpt_sovits.styles]]
+name = "default"
+refer_wav = "/path/to/reference.wav"
+prompt_text = "参考文本"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt" # 可选:动态模型切换
+sovits_weights = "/path/to/model.pth" # 可选:动态模型切换
+
+[[gpt_sovits.styles]]
+name = "happy"
+refer_wav = "/path/to/happy.wav"
+prompt_text = "开心的参考文本"
+prompt_language = "zh"
+```
+
+#### 格式2:字典格式(兼容旧版)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[gpt_sovits.styles.default]
+refer_wav = "/path/to/reference.wav"
+prompt_text = "参考文本"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt"
+sovits_weights = "/path/to/model.pth"
+```
+
+> **提示:** 插件会自动识别并兼容两种格式,推荐使用数组格式以获得更好的 WebUI 支持。
+
+### CosyVoice 配置
+
+```toml
+[cosyvoice]
+gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
+default_mode = "3s极速复刻" # 或 "自然语言控制"
+default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>" # 只有自然语言控制模式才会生效,3s极速复刻模式下不生效
+reference_audio = "/path/to/ref.wav" # 参考音频路径
+prompt_text = "参考音频对应的文本" # 参考音频的对应文本
+timeout = 300 # API超时(秒)
+```
+
+**支持的方言/情感/语速:**
+
+| 类型 | 可用选项 |
+|------|----------|
+| 方言 | 广东话、东北话、四川话、上海话、闽南话、山东话、陕西话、湖南话等17种 |
+| 情感 | 开心、伤心、生气 |
+| 语速 | 慢速、快速 |
+| 音量 | 大声、小声 |
+| 特殊风格 | 小猪佩奇、机器人 |
+
+**推理模式:**
+- `3s极速复刻`:需要提供参考音频进行声音克隆
+- `自然语言控制`:通过指令控制方言、情感、语速等
+
+## 使用方法
+
+### 命令触发
+
+```
+/tts 你好世界 # 使用默认后端
+/tts 今天天气不错 小新 # 指定音色
+/gsv2p 你好世界 # 使用 GSV2P
+/doubao 你好世界 # 使用豆包
+/cosyvoice 你好世界 四川话 # 使用 CosyVoice,四川话
+/comfyui 你好世界 -v default # 使用 ComfyUI 本地工作流(MLX VoiceClone)
+```
+
+## ComfyUI 后端配置
+
+该后端通过 ComfyUI 的 HTTP API 执行工作流(`/prompt` -> `/history` -> `/view`),并用 `LoadAudio` 从 ComfyUI 的 `input` 目录读取参考音频。
+
+```toml
+[comfyui]
+server = "http://127.0.0.1:8188"
+input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
+timeout = 120
+audio_quality = "128k" # SaveAudioMP3: V0/128k/320k
+mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
+mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
+default_style = "default"
+
+[[comfyui.styles]]
+name = "default"
+refer_wav = "/path/to/ref.wav"
+prompt_text = "参考音频逐字稿"
+language = "Auto" # 可选: Auto/Chinese/English/Japanese...
+model_choice = "1.7B"
+precision = "bf16"
+seed = 0
+max_new_tokens = 2048
+top_p = 0.8
+top_k = 20
+temperature = 1.0
+repetition_penalty = 1.05
+```
+
+### 自动触发
+
+LLM 判断需要语音回复时会自动触发,可通过概率控制:
+
+```toml
+[probability]
+enabled = false # 默认关闭,每次都触发语音
+base_probability = 0.3 # 启用时 30% 概率触发
+```
+
+### 智能分割插件支持
+
+本插件已适配智能分割插件,支持使用 `|||SPLIT|||` 分隔符进行精确分段:
+
+- **优先级**:智能分割标记 > 自动句子分割 > 单句发送
+- **使用方式**:智能分割插件会在适当位置插入 `|||SPLIT|||` 标记,本插件自动识别并按标记分段发送
+- **示例**:`今天天气不错|||SPLIT|||适合出去玩|||SPLIT|||你觉得呢` 会分成三段语音依次发送
+
+## 项目结构
+
+```
+tts_voice_plugin/
+├── plugin.py # 插件入口
+├── config.toml # 配置文件
+├── backends/ # 后端实现
+│ ├── ai_voice.py
+│ ├── gsv2p.py
+│ ├── gpt_sovits.py
+│ ├── doubao.py
+│ └── cosyvoice.py
+└── utils/ # 工具函数
+```
+
+## 常见问题
+
+**Q: Docker环境中提示"文件处理失败 识别URL失败"?**
+A: 留空 `audio_output_dir` 配置项,插件将使用项目根目录保存音频(相对路径)。如仍有问题,可设置 `use_base64_audio = true` 使用base64编码发送。
+
+**Q: AI Voice 提示"仅支持群聊"?**
+A: AI Voice 只能在群聊使用,私聊会自动切换到其他后端。
+
+**Q: 豆包语音怎么获取凭证?**
+A: 登录火山引擎控制台,开通语音合成服务获取。
+
+**Q: 文本太长被截断?**
+A: 修改 `config.toml` 中 `max_text_length = 1000`
+
+**Q: 语音合成失败时不想让Bot发送错误消息?**
+A: 设置 `send_error_messages = false`,语音合成失败时将静默处理,不向用户发送错误提示。
+
+## 更新日志
+
+### v3.2.3
+- 修复豆包语音 WAV 流式响应合并问题(正确处理 LIST/INFO 元数据块和多 header 情况)
+- 默认后端改为 CosyVoice(更稳定的声音克隆体验)
+- 默认关闭概率控制(每次触发都生成语音,更可预期的行为)
+- 优化 LLM 长度约束提示(利用"近因效应"提高遵守率)
+- 优化 action 记录格式,帮助 planner 避免重复执行
+- GSV2P/豆包音频格式默认改为 WAV(更好的兼容性)
+- CosyVoice 默认模式改为 3s 极速复刻(更快响应)
+- 更新默认超时配置(CosyVoice 300s, GSV2P 120s)
+
+### v3.2.2
+- 适配智能分割插件(支持 `|||SPLIT|||` 分隔符精确分段)
+- GPT-SoVITS 支持数组格式配置(WebUI 友好,向后兼容字典格式)
+- 修复豆包语音音色信息显示乱码问题
+- 优化配置文件注释,更简洁清晰
+- 优化分段发送逻辑优先级(智能分割 > 自动分割 > 单句)
+- 禁用 Python 字节码生成(保持目录干净)
+- 添加插件 ID 标识字段
+
+### v3.2.1
+- 新增 `send_error_messages` 配置项(可选择关闭错误提示消息)
+- 统一错误消息处理逻辑(通过 `_send_error` 方法)
+
+### v3.2.0
+- 新增 CosyVoice 后端(阿里云 ModelScope,支持 17 种方言、3 秒声音克隆)
+- 新增分段发送功能(长文本自动分割逐句发送)
+- GPT-SoVITS 支持动态模型切换(在风格配置中指定 gpt_weights/sovits_weights)
+- GSV2P 后端新增重试机制(5 次重试,3 秒间隔)
+- 新增 `/cosyvoice` 命令
+- 新增 gradio_client 依赖
+
+### v3.1.0
+- 新增豆包语音后端(火山引擎云服务)
+- 重构为模块化架构
+- HTTP Session 复用优化
+
+## 信息
+
+- 版本:3.2.3
+- 作者:靓仔
+- 许可:AGPL-v3.0
diff --git a/plugins/tts_voice_plugin/_manifest.json b/plugins/tts_voice_plugin/_manifest.json
new file mode 100644
index 00000000..d640b6a3
--- /dev/null
+++ b/plugins/tts_voice_plugin/_manifest.json
@@ -0,0 +1,235 @@
+{
+ "manifest_version": 1,
+ "name": "统一TTS语音合成插件",
+ "version": "3.2.3",
+ "description": "统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice五种后端引擎,提供灵活的语音合成能力。",
+ "author": {
+ "name": "靓仔",
+ "url": "https://github.com/xuqian13"
+ },
+ "license": "AGPL-v3.0",
+ "homepage_url": "",
+ "repository_url": "https://github.com/xuqian13/tts_voice_plugin",
+ "keywords": [
+ "TTS",
+ "语音合成",
+ "文本转语音",
+ "AI语音",
+ "GSV2P",
+ "GPT-SoVITS",
+ "豆包",
+ "CosyVoice",
+ "火山引擎",
+ "多后端",
+ "语音",
+ "朗读",
+ "音色",
+ "语音播报",
+ "方言",
+ "声音克隆",
+ "MaiCore"
+ ],
+ "categories": [
+ "语音",
+ "AI",
+ "聊天增强",
+ "娱乐",
+ "Utility",
+ "Communication",
+ "Accessibility"
+ ],
+ "host_application": {
+ "min_version": "0.12.0"
+ },
+ "default_locale": "zh-CN",
+ "plugin_info": {
+ "is_built_in": false,
+ "plugin_type": "general",
+ "components": [
+ {
+ "type": "action",
+ "name": "unified_tts_action",
+ "description": "统一TTS语音合成Action,支持四种后端引擎智能切换,LLM自主判断触发"
+ },
+ {
+ "type": "command",
+ "name": "unified_tts_command",
+ "description": "统一TTS命令,支持/tts、/voice、/gsv2p、/doubao多种命令格式,灵活指定后端和音色"
+ }
+ ],
+ "features": [
+ "支持五种TTS后端:AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice",
+ "AI Voice: MaiCore内置,简单快速,22+预设音色",
+ "GSV2P: 云端API,高质量合成,丰富的调节参数",
+ "GPT-SoVITS: 本地服务,高度定制化,多风格支持",
+ "豆包语音: 字节跳动云服务,支持复刻音色和情感控制",
+ "CosyVoice: 阿里云语音合成,支持17种方言、3秒声音克隆、情感控制",
+ "模块化架构,后端独立实现,易于扩展",
+ "HTTP Session复用,提升性能",
+ "临时文件自动清理,避免并发冲突",
+ "智能触发模式(LLM自主判断)和手动命令模式",
+ "概率控制机制,避免语音回复过于频繁",
+ "智能语言检测(中文/英文/日文)",
+ "文本自动清理和网络用语转换",
+ "完善的错误处理和重试机制",
+ "灵活的配置系统,支持各后端独立配置"
+ ],
+ "dependencies": {
+ "python": [
+ "aiohttp",
+ "gradio_client"
+ ],
+ "system": [],
+ "plugins": []
+ },
+ "backend_info": {
+ "ai_voice": {
+ "provider": "MaiCore内置",
+ "endpoint": "AI_VOICE_SEND命令",
+ "authentication": "无需认证",
+ "limitations": "仅支持群聊使用",
+ "voices": "22+预设音色(小新、妲己、酥心御姐等)"
+ },
+ "gsv2p": {
+ "provider": "GSV2P云服务",
+ "endpoint": "https://gsv2p.acgnai.top/v1/audio/speech",
+ "authentication": "需要API Token",
+ "limitations": "API调用限制",
+ "features": "高质量合成、多语言支持、丰富参数调节"
+ },
+ "gpt_sovits": {
+ "provider": "本地GPT-SoVITS服务",
+ "endpoint": "http://127.0.0.1:9880",
+ "authentication": "无需认证",
+ "limitations": "需要本地部署服务",
+ "features": "高度定制化、多风格支持、模型权重切换"
+ },
+ "doubao": {
+ "provider": "字节跳动火山引擎",
+ "endpoint": "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
+ "authentication": "需要app_id、access_key、resource_id",
+ "limitations": "需要火山引擎账号",
+ "features": "快速高质量、支持复刻音色、情感语气控制"
+ },
+ "cosyvoice": {
+ "provider": "阿里云 CosyVoice",
+ "endpoint": "ModelScope Gradio API",
+ "authentication": "无需认证(公开Gradio接口)",
+ "limitations": "依赖ModelScope服务可用性",
+ "features": "3秒声音克隆、17种方言支持、情感语速控制、自然语言指令"
+ }
+ }
+ },
+ "configuration": {
+ "config_file": "config.toml",
+ "config_template": "config.toml.example",
+ "auto_generate": true,
+ "sections": [
+ {
+ "name": "plugin",
+ "description": "插件基本配置"
+ },
+ {
+ "name": "general",
+ "description": "通用设置(默认后端、超时、文本长度等)"
+ },
+ {
+ "name": "components",
+ "description": "组件启用控制"
+ },
+ {
+ "name": "probability",
+ "description": "概率控制配置(避免语音回复过于频繁)"
+ },
+ {
+ "name": "ai_voice",
+ "description": "AI Voice后端配置(音色映射等)"
+ },
+ {
+ "name": "gsv2p",
+ "description": "GSV2P后端配置(API地址、Token、参数等)"
+ },
+ {
+ "name": "gpt_sovits",
+ "description": "GPT-SoVITS后端配置(服务地址、风格配置等)"
+ },
+ {
+ "name": "doubao",
+ "description": "豆包语音后端配置(火山引擎认证、音色、情感等)"
+ },
+ {
+ "name": "cosyvoice",
+ "description": "CosyVoice后端配置(Gradio URL、模式、方言等)"
+ }
+ ]
+ },
+ "usage_examples": [
+ {
+ "type": "action",
+ "backend": "auto",
+ "description": "LLM自动触发语音回复",
+ "example": "用户:请用语音说\"你好世界\"\n机器人:[使用默认后端自动生成语音文件并发送]"
+ },
+ {
+ "type": "command",
+ "backend": "ai_voice",
+ "description": "手动命令使用AI Voice",
+ "example": "/tts 你好世界 小新"
+ },
+ {
+ "type": "command",
+ "backend": "gsv2p",
+ "description": "手动命令使用GSV2P",
+ "example": "/gsv2p 今天天气不错"
+ },
+ {
+ "type": "command",
+ "backend": "doubao",
+ "description": "手动命令使用豆包语音",
+ "example": "/doubao 你好世界"
+ },
+ {
+ "type": "command",
+ "backend": "gpt_sovits",
+ "description": "手动命令使用GPT-SoVITS",
+ "example": "/tts 测试一下 default gpt_sovits"
+ },
+ {
+ "type": "command",
+ "backend": "cosyvoice",
+ "description": "手动命令使用CosyVoice",
+ "example": "/cosyvoice 你好世界 四川话"
+ },
+ {
+ "type": "command",
+ "backend": "auto",
+ "description": "使用默认后端",
+ "example": "/tts 你好世界"
+ }
+ ],
+ "migration_info": {
+ "from_plugins": [
+ "ai_voice_plugin (v1.0.0)",
+ "gsv2p_tts_plugin (v1.0.0)",
+ "tts_voice_plugin (v2.0.0)",
+ "tts_voice_plugin (v3.0.0)"
+ ],
+ "migration_notes": [
+ "本插件整合了ai_voice_plugin、gsv2p_tts_plugin和旧版tts_voice_plugin的所有功能",
+ "v3.2.2适配智能分割插件(支持|||SPLIT|||分隔符精确分段)",
+ "v3.2.2支持GPT-SoVITS数组格式配置(WebUI友好,向后兼容字典格式)",
+ "v3.2.2修复豆包语音音色信息显示乱码问题",
+ "v3.2.2优化配置文件注释,更简洁清晰",
+ "v3.2.0新增CosyVoice后端支持(阿里云语音合成,支持17种方言和3秒声音克隆)",
+ "v3.1.0新增豆包语音后端支持",
+ "v3.1.0重构为模块化架构,提升代码可维护性",
+ "配置文件需要重新生成,原配置需手动迁移",
+ "建议备份旧插件配置后再迁移",
+ "AI Voice音色映射保持兼容",
+ "GSV2P API配置需重新填写Token",
+ "GPT-SoVITS风格配置需要重新设置",
+ "新增config.toml.example模板文件"
+ ]
+ },
+ "id": "tts_voice_plugin"
+}
\ No newline at end of file
diff --git a/plugins/tts_voice_plugin/backends/__init__.py b/plugins/tts_voice_plugin/backends/__init__.py
new file mode 100644
index 00000000..ddcafef1
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/__init__.py
@@ -0,0 +1,38 @@
+"""
+TTS后端模块
+"""
+
+import sys
+sys.dont_write_bytecode = True
+
+from .base import TTSBackendBase, TTSBackendRegistry, TTSResult
+from .ai_voice import AIVoiceBackend
+from .gsv2p import GSV2PBackend
+from .gpt_sovits import GPTSoVITSBackend
+from .doubao import DoubaoBackend
+from .cosyvoice import CosyVoiceBackend
+from .comfyui import ComfyUIBackend, ComfyUIVoiceCloneBackend, ComfyUICustomVoiceBackend
+
+# 注册后端
+TTSBackendRegistry.register("ai_voice", AIVoiceBackend)
+TTSBackendRegistry.register("gsv2p", GSV2PBackend)
+TTSBackendRegistry.register("gpt_sovits", GPTSoVITSBackend)
+TTSBackendRegistry.register("doubao", DoubaoBackend)
+TTSBackendRegistry.register("cosyvoice", CosyVoiceBackend)
+TTSBackendRegistry.register("comfyui", ComfyUIBackend)
+TTSBackendRegistry.register("comfyui_voiceclone", ComfyUIVoiceCloneBackend)
+TTSBackendRegistry.register("comfyui_customvoice", ComfyUICustomVoiceBackend)
+
+__all__ = [
+ "TTSBackendBase",
+ "TTSBackendRegistry",
+ "TTSResult",
+ "AIVoiceBackend",
+ "GSV2PBackend",
+ "GPTSoVITSBackend",
+ "DoubaoBackend",
+ "CosyVoiceBackend",
+ "ComfyUIBackend",
+ "ComfyUIVoiceCloneBackend",
+ "ComfyUICustomVoiceBackend",
+]
diff --git a/plugins/tts_voice_plugin/backends/ai_voice.py b/plugins/tts_voice_plugin/backends/ai_voice.py
new file mode 100644
index 00000000..c916fa00
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/ai_voice.py
@@ -0,0 +1,133 @@
+"""
+AI Voice 后端实现
+使用 MaiCore 内置的 AI 语音功能
+"""
+
+from typing import Optional, Callable, Dict
+from .base import TTSBackendBase, TTSResult
+from ..utils.text import TTSTextUtils
+from ..config_keys import ConfigKeys
+from src.common.logger import get_logger
+
+logger = get_logger("tts_ai_voice")
+
+# AI Voice 音色映射表
+AI_VOICE_ALIAS_MAP = {
+ "小新": "lucy-voice-laibixiaoxin",
+ "猴哥": "lucy-voice-houge",
+ "四郎": "lucy-voice-silang",
+ "东北老妹儿": "lucy-voice-guangdong-f1",
+ "广西大表哥": "lucy-voice-guangxi-m1",
+ "妲己": "lucy-voice-daji",
+ "霸道总裁": "lucy-voice-lizeyan",
+ "酥心御姐": "lucy-voice-suxinjiejie",
+ "说书先生": "lucy-voice-m8",
+ "憨憨小弟": "lucy-voice-male1",
+ "憨厚老哥": "lucy-voice-male3",
+ "吕布": "lucy-voice-lvbu",
+ "元气少女": "lucy-voice-xueling",
+ "文艺少女": "lucy-voice-f37",
+ "磁性大叔": "lucy-voice-male2",
+ "邻家小妹": "lucy-voice-female1",
+ "低沉男声": "lucy-voice-m14",
+ "傲娇少女": "lucy-voice-f38",
+ "爹系男友": "lucy-voice-m101",
+ "暖心姐姐": "lucy-voice-female2",
+ "温柔妹妹": "lucy-voice-f36",
+ "书香少女": "lucy-voice-f34"
+}
+
+
class AIVoiceBackend(TTSBackendBase):
    """MaiCore built-in AI voice backend.

    Synthesis is delegated to the host application's ``AI_VOICE_SEND``
    command, so no audio is produced locally. Only usable in group chats.
    """

    backend_name = "ai_voice"
    backend_description = "MaiCore内置AI语音(仅群聊)"
    support_private_chat = False  # group-chat only
    default_audio_format = ""  # no local audio file is generated

    def __init__(self, config_getter, log_prefix: str = ""):
        super().__init__(config_getter, log_prefix)
        # Injected later via set_send_command(); stays None until then.
        self._send_command = None

    def set_send_command(self, send_command_func: Callable) -> None:
        """Inject the host callback used to dispatch AI_VOICE_SEND (set by Action/Command)."""
        self._send_command = send_command_func

    def get_default_voice(self) -> str:
        """Return the configured default character name."""
        return self.get_config(ConfigKeys.AI_VOICE_DEFAULT_CHARACTER, "温柔妹妹")

    def resolve_voice(self, voice: Optional[str]) -> str:
        """Map a user-facing alias to the internal ``lucy-voice-*`` identifier."""
        aliases: Dict[str, str] = self.get_config(
            ConfigKeys.AI_VOICE_ALIAS_MAP, AI_VOICE_ALIAS_MAP
        )
        return TTSTextUtils.resolve_voice_alias(
            voice, aliases, self.get_default_voice(), prefix="lucy-voice-"
        )

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """Send *text* as an AI voice message using the resolved character.

        Args:
            text: text to speak
            voice: character name or alias; falls back to the default

        Returns:
            TTSResult describing success or the failure reason.
        """
        if not self._send_command:
            # set_send_command() was never called; nothing can be dispatched.
            return TTSResult(
                success=False,
                message="AI Voice后端未正确初始化(缺少send_command)",
                backend_name=self.backend_name,
            )

        character = self.resolve_voice(voice)

        try:
            sent = await self._send_command(
                command_name="AI_VOICE_SEND",
                args={"text": text, "character": character},
                storage_message=False,
            )
        except Exception as e:
            logger.error(f"{self.log_prefix} AI语音执行错误: {e}")
            return TTSResult(
                success=False,
                message=f"AI语音执行错误: {e}",
                backend_name=self.backend_name,
            )

        if not sent:
            return TTSResult(
                success=False,
                message="AI语音命令发送失败",
                backend_name=self.backend_name,
            )

        logger.info(f"{self.log_prefix} AI语音发送成功 (音色: {character})")
        return TTSResult(
            success=True,
            message=f"成功发送AI语音 (音色: {character})",
            backend_name=self.backend_name,
        )
diff --git a/plugins/tts_voice_plugin/backends/base.py b/plugins/tts_voice_plugin/backends/base.py
new file mode 100644
index 00000000..9d8936f4
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/base.py
@@ -0,0 +1,239 @@
+"""
+TTS后端抽象基类和注册表
+"""
+
+import asyncio
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Dict, Type, Optional, Any, Callable, Tuple, Union
+from src.common.logger import get_logger
+from ..config_keys import ConfigKeys
+
+logger = get_logger("tts_backend")
+
+
@dataclass
class TTSResult:
    """Outcome of a single TTS execution.

    ``__iter__`` is provided for backward compatibility so callers can
    unpack a result as ``success, message = result``.
    """
    success: bool
    message: str
    audio_path: Optional[str] = None
    backend_name: str = ""

    def __iter__(self):
        """Yield (success, message) so the result unpacks as a 2-tuple."""
        yield self.success
        yield self.message
+
+
class TTSBackendBase(ABC):
    """
    Abstract base class for TTS backends.

    Every backend must subclass this and implement :meth:`execute`.
    """

    # Backend name (subclasses must override).
    backend_name: str = "base"

    # Human-readable backend description.
    backend_description: str = "TTS后端基类"

    # Whether the backend may be used in private chats.
    support_private_chat: bool = True

    # Default audio container format.
    default_audio_format: str = "mp3"

    def __init__(self, config_getter: Callable[[str, Any], Any], log_prefix: str = ""):
        """
        Initialize the backend.

        Args:
            config_getter: config lookup function with signature get_config(key, default)
            log_prefix: prefix prepended to every log line
        """
        self.get_config = config_getter
        self.log_prefix = log_prefix or f"[{self.backend_name}]"
        # Callback for sending custom (voice) messages; injected via set_send_custom().
        self._send_custom = None

    def set_send_custom(self, send_custom_func: Callable) -> None:
        """Inject the callback used to send custom (voice) messages."""
        self._send_custom = send_custom_func

    async def send_audio(
        self,
        audio_data: bytes,
        audio_format: str = "mp3",
        prefix: str = "tts",
        voice_info: str = ""
    ) -> TTSResult:
        """
        Unified audio delivery helper.

        Sends *audio_data* either as a base64 payload or as a temp-file path,
        depending on the GENERAL_USE_BASE64_AUDIO config flag.

        Args:
            audio_data: raw audio bytes
            audio_format: container format (e.g. mp3, wav)
            prefix: temp-file name prefix
            voice_info: voice description used in log/result messages

        Returns:
            TTSResult
        """
        # Imported lazily to avoid a circular import with the utils package.
        from ..utils.file import TTSFileManager

        # Decide between base64 payload and file-path delivery.
        use_base64 = self.get_config(ConfigKeys.GENERAL_USE_BASE64_AUDIO, False)
        logger.debug(f"{self.log_prefix} 开始发送音频 (原始大小: {len(audio_data)}字节, 格式: {audio_format})")

        if use_base64:
            # Base64 mode: no file is written; the encoded payload goes straight out.
            base64_audio = TTSFileManager.audio_to_base64(audio_data)
            if not base64_audio:
                return TTSResult(False, "音频数据转base64失败", backend_name=self.backend_name)

            logger.debug(f"{self.log_prefix} base64编码完成,准备通过send_custom发送")
            if self._send_custom:
                await self._send_custom(message_type="voice", content=base64_audio)
                logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (base64模式, 音频大小: {len(audio_data)}字节)")
            else:
                logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音")
                return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name)

            return TTSResult(
                success=True,
                message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}, base64模式",
                backend_name=self.backend_name
            )
        else:
            # File-path mode: write the audio to a temp file and send its path.
            output_dir = self.get_config(ConfigKeys.GENERAL_AUDIO_OUTPUT_DIR, "")
            audio_path = TTSFileManager.generate_temp_path(
                prefix=prefix,
                suffix=f".{audio_format}",
                output_dir=output_dir
            )

            if not await TTSFileManager.write_audio_async(audio_path, audio_data):
                return TTSResult(False, "保存音频文件失败", backend_name=self.backend_name)

            logger.debug(f"{self.log_prefix} 音频文件已保存, 路径: {audio_path}")
            # Send the voice message referencing the file path.
            if self._send_custom:
                await self._send_custom(message_type="voiceurl", content=audio_path)
                logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (文件路径模式, 路径: {audio_path})")
                # Fire-and-forget cleanup of the temp file after a delay,
                # giving the downstream client time to fetch it.
                asyncio.create_task(TTSFileManager.cleanup_file_async(audio_path, delay=30))
            else:
                logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音")
                return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name)

            return TTSResult(
                success=True,
                message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}",
                audio_path=audio_path,
                backend_name=self.backend_name
            )

    @abstractmethod
    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        Perform the TTS conversion.

        Args:
            text: text to convert
            voice: voice/style identifier
            **kwargs: extra backend-specific parameters (e.g. emotion)

        Returns:
            TTSResult describing the outcome
        """
        raise NotImplementedError

    def validate_config(self) -> Tuple[bool, str]:
        """
        Validate that the backend's configuration is complete.

        Returns:
            (is_valid, error_message)
        """
        return True, ""

    def get_default_voice(self) -> str:
        """Return the default voice; the base implementation has none."""
        return ""

    def is_available(self) -> bool:
        """True when the backend's configuration validates successfully."""
        is_valid, _ = self.validate_config()
        return is_valid
+
+
class TTSBackendRegistry:
    """Class-level registry mapping backend names to backend classes.

    Combines the strategy and factory patterns: backends register their
    class under a name, and callers instantiate them via :meth:`create`.
    """

    _backends: Dict[str, Type[TTSBackendBase]] = {}

    @classmethod
    def register(cls, name: str, backend_class: Type[TTSBackendBase]) -> None:
        """Register *backend_class* under *name* (silently overwrites).

        Args:
            name: backend name
            backend_class: backend class to register
        """
        cls._backends[name] = backend_class
        logger.debug(f"注册TTS后端: {name}")

    @classmethod
    def unregister(cls, name: str) -> None:
        """Remove *name* from the registry; a no-op when absent."""
        cls._backends.pop(name, None)

    @classmethod
    def get(cls, name: str) -> Optional[Type[TTSBackendBase]]:
        """Return the class registered under *name*, or None."""
        return cls._backends.get(name)

    @classmethod
    def create(
        cls,
        name: str,
        config_getter: Callable[[str, Any], Any],
        log_prefix: str = ""
    ) -> Optional[TTSBackendBase]:
        """Instantiate the backend registered under *name*.

        Args:
            name: backend name
            config_getter: config lookup function passed to the backend
            log_prefix: log prefix passed to the backend

        Returns:
            A backend instance, or None when *name* is unknown.
        """
        backend_cls = cls.get(name)
        return backend_cls(config_getter, log_prefix) if backend_cls else None

    @classmethod
    def list_backends(cls) -> list[str]:
        """Names of all registered backends."""
        return [*cls._backends]

    @classmethod
    def is_registered(cls, name: str) -> bool:
        """True if a backend class is registered under *name*."""
        return name in cls._backends
diff --git a/plugins/tts_voice_plugin/backends/comfyui.py b/plugins/tts_voice_plugin/backends/comfyui.py
new file mode 100644
index 00000000..d574e9fe
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/comfyui.py
@@ -0,0 +1,827 @@
+"""
+ComfyUI backend (Workflow API).
+
+This backend calls a fixed ComfyUI prompt graph that:
+LoadAudio -> MLX_Qwen3TTSVoiceClone -> SaveAudioMP3
+
+Rationale:
+- ComfyUI expects API-format "prompt" graphs (not UI workflow json).
+- For audio inputs, the simplest reliable path is to copy the reference audio into ComfyUI/input
+ and use the built-in LoadAudio node.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import os
+import re
+import time
+import uuid
+from typing import Any, ClassVar, Dict, Optional, Tuple
+from urllib.parse import urlencode
+
+from src.common.logger import get_logger
+from src.plugin_system.apis import generator_api
+
+from .base import TTSBackendBase, TTSResult
+from ..config_keys import ConfigKeys
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..utils.text import TTSTextUtils
+
+logger = get_logger("tts_comfyui")
+
+
+LANG_TO_DEMO = {
+ "zh": "Chinese",
+ "ja": "Japanese",
+ "en": "English",
+}
+
+
+class ComfyUIBackend(TTSBackendBase):
+ backend_name = "comfyui"
+ backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone/CustomVoice)"
+ support_private_chat = True
+ default_audio_format = "mp3"
+
+ _ref_cache: ClassVar[Dict[str, str]] = {}
+ _instruct_cache: ClassVar[Dict[str, str]] = {}
+ # If set by subclasses, only these modes are allowed (e.g. {"voice_clone"}).
+ allowed_modes: ClassVar[Optional[set[str]]] = None
+
    def get_default_voice(self) -> str:
        # Style name used when the caller does not specify one explicitly.
        return self.get_config(ConfigKeys.COMFYUI_DEFAULT_STYLE, "default")
+
+ def _filter_styles_by_mode(self, styles: Dict[str, Any]) -> Dict[str, Any]:
+ allowed = self.allowed_modes
+ if not allowed:
+ return styles
+ out: Dict[str, Any] = {}
+ for name, st in (styles or {}).items():
+ if not isinstance(st, dict):
+ continue
+ mode = str(st.get("mode") or "voice_clone").strip()
+ if mode in allowed:
+ out[name] = st
+ return out
+
+ def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
+ # Match GPT-SoVITS backend style schema: list[{name,...}] or dict{name:{...}}
+ if isinstance(styles_config, dict):
+ return styles_config
+ if isinstance(styles_config, list):
+ result = {}
+ for style in styles_config:
+ if isinstance(style, dict) and "name" in style:
+ name = style["name"]
+ result[name] = {k: v for k, v in style.items() if k != "name"}
+ return result
+ return {}
+
+ def _clean_instruct(self, s: str, max_chars: int) -> str:
+ s = (s or "").strip()
+ if not s:
+ return ""
+
+ # Strip common wrappers.
+ s = s.replace("```", "").strip()
+ s = re.sub(r"^instruct\\s*[::]\\s*", "", s, flags=re.IGNORECASE).strip()
+
+ # Prefer first non-empty line.
+ for line in s.splitlines():
+ line = line.strip()
+ if line:
+ s = line
+ break
+
+ # Trim quotes.
+ if len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")):
+ s = s[1:-1].strip()
+
+ if max_chars and len(s) > max_chars:
+ s = s[:max_chars].rstrip()
+ return s
+
+ def _clean_base_tone(self, s: str) -> str:
+ """
+ Clean a base tone/persona string so it can safely live inside `基调=...`:
+ - single-line
+ - no semicolons (they are field separators)
+ - no '=' (KV separator)
+ """
+ s = (s or "").strip()
+ if not s:
+ return ""
+ s = s.replace("\r", " ").replace("\n", " ")
+ s = re.sub(r"\\s+", " ", s).strip()
+ # Avoid breaking KV parsing.
+ s = s.replace(";", ",").replace(";", ",")
+ s = s.replace("=", " ").replace("=", " ")
+ return s.strip(" ,,")
+
    def _attach_base_tone(self, instruct: str, max_chars: int) -> str:
        """
        If configured, prefix inferred instruct with a fixed base tone/persona:
        `基调=<...>;情绪=...;语速=...;停顿=...`

        Priority when trimming: keep the inferred instruct fields intact if possible.
        """
        # Configured persona string; sanitized so it cannot break KV parsing.
        base_raw = self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or ""
        base = self._clean_base_tone(str(base_raw))
        if not base:
            return (instruct or "").strip()

        s = (instruct or "").strip()
        fields = self._parse_instruct_fields(s)
        # Don't double-prefix: keep an existing 基调 field untouched.
        if "基调" in fields:
            return s

        prefix = f"基调={base}"
        if not s:
            return prefix[:max_chars].rstrip() if max_chars else prefix

        combined = f"{prefix};{s}"
        if not max_chars or len(combined) <= max_chars:
            return combined

        # Too long: try trimming base first, keeping inferred instruct intact.
        remain = max_chars - len(s) - len(";") - len("基调=")
        if remain <= 0:
            # Can't fit base at all; keep instruct (already max_chars-limited upstream).
            return s[:max_chars].rstrip()
        base_trim = base[:remain].rstrip(" ,,")
        return f"基调={base_trim};{s}"
+
+ def _parse_instruct_fields(self, instruct: str) -> Dict[str, str]:
+ """
+ Parse a 1-line instruct like:
+ 情绪=愤怒;语速=很快;停顿=很少;表现=咬牙切齿
+
+ We only *use* a few keys (情绪/语速/停顿/强度/表现...), but keep it generic.
+ """
+ s = (instruct or "").strip()
+ if not s:
+ return {}
+
+ # Normalize separators (full-width punctuation).
+ s = s.replace(";", ";").replace(":", ":").replace("=", "=")
+
+ # Split by semicolon/comma-like separators.
+ parts = [p.strip() for p in re.split(r"[;]+", s) if p.strip()]
+ out: Dict[str, str] = {}
+ for p in parts:
+ if "=" not in p:
+ continue
+ k, v = p.split("=", 1)
+ k = k.strip()
+ v = v.strip()
+ if not k or not v:
+ continue
+ # Limit key length to avoid garbage.
+ if len(k) > 8:
+ continue
+ out[k] = v
+ return out
+
+ def _map_speed_label(self, label: str) -> Optional[float]:
+ lab = (label or "").strip()
+ m = {
+ "很慢": 0.85,
+ "稍慢": 0.93,
+ "正常": 1.00,
+ "稍快": 1.07,
+ "很快": 1.15,
+ }
+ return m.get(lab)
+
+ def _map_pause_label(self, label: str) -> Optional[float]:
+ lab = (label or "").strip()
+ m = {
+ "很少": 0.6,
+ "自然": 1.0,
+ "稍多": 1.3,
+ "很多": 1.7,
+ }
+ return m.get(lab)
+
+ def _ensure_base_pause_cfg(self, pause_cfg: Dict[str, float]) -> Dict[str, float]:
+ # If caller didn't configure pauses (all zeros), apply a conservative base so "停顿" can take effect.
+ keys = ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]
+ if all(float(pause_cfg.get(k, 0.0) or 0.0) == 0.0 for k in keys):
+ return {
+ **pause_cfg,
+ "pause_linebreak": 0.18,
+ "period_pause": 0.22,
+ "comma_pause": 0.10,
+ "question_pause": 0.20,
+ "hyphen_pause": 0.06,
+ }
+ return pause_cfg
+
    def _enrich_instruct_for_emotion(self, instruct: str, max_chars: int) -> str:
        """
        Add short performance cues for common emotions, keeping it single-line KV style.
        This helps when the model under-reacts to simple labels like "愤怒".
        """
        s = (instruct or "").strip()
        if not s:
            return ""

        fields = self._parse_instruct_fields(s)
        emo = fields.get("情绪", "")
        if not emo:
            return s

        # Only add if it doesn't already contain a "表现=" field.
        if "表现" in fields:
            return s

        # Pick cue text by substring match on the emotion label.
        emo_norm = emo
        cues = ""
        if "愤怒" in emo_norm or "生气" in emo_norm:
            cues = "声压高,咬字重,重音强,尾音下压"
        elif "开心" in emo_norm or "高兴" in emo_norm:
            cues = "笑意明显,轻快上扬,尾音明亮"
        elif "悲伤" in emo_norm or "难过" in emo_norm:
            cues = "气声略多,音量偏低,语尾下沉"
        elif "温柔" in emo_norm:
            cues = "音量轻,气声柔,语尾轻收"
        elif "冷淡" in emo_norm or "冷静" in emo_norm:
            cues = "平直克制,少起伏,干净收尾"

        if not cues:
            # Unrecognized emotion: leave the instruct unchanged.
            return s

        extra = f";表现={cues}"
        if max_chars and len(s) + len(extra) > max_chars:
            # Trim cues to fit.
            allow = max_chars - len(s) - len(";表现=")
            if allow <= 0:
                return s[:max_chars].rstrip()
            cues = cues[:allow].rstrip(",, ")
            extra = f";表现={cues}"
        return (s + extra)[:max_chars].rstrip() if max_chars else (s + extra)
+
    def _apply_instruct_controls(
        self, instruct: str, speed: float, pause_cfg: Dict[str, float], max_chars: int
    ) -> Tuple[str, float, Dict[str, float]]:
        """
        If instruct contains '语速'/'停顿', map them to real synthesis controls.
        This makes auto_instruct meaningfully affect output even if the model is insensitive to labels.

        Args:
            instruct: one-line KV instruct string (may be empty)
            speed: baseline speed multiplier
            pause_cfg: pause-length configuration to scale
            max_chars: budget applied when enriching the instruct

        Returns:
            (possibly-enriched instruct, effective speed, effective pause config)
        """
        s = (instruct or "").strip()
        if not s:
            return "", speed, pause_cfg

        fields = self._parse_instruct_fields(s)
        speed_label = fields.get("语速", "")
        pause_label = fields.get("停顿", "")

        # Speed: replace the baseline only when the label maps to a value.
        out_speed = float(speed)
        mapped_speed = self._map_speed_label(speed_label)
        if mapped_speed is not None:
            out_speed = mapped_speed

        # Pauses: scale every knob by the mapped factor, seeding base values
        # first so an all-zero config still reacts to the 停顿 label.
        out_pause_cfg = dict(pause_cfg or {})
        mapped_pause = self._map_pause_label(pause_label)
        if mapped_pause is not None:
            out_pause_cfg = self._ensure_base_pause_cfg(out_pause_cfg)
            for k in ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]:
                try:
                    out_pause_cfg[k] = float(out_pause_cfg.get(k, 0.0) or 0.0) * float(mapped_pause)
                except Exception:
                    # Non-numeric config value: leave that knob untouched.
                    pass

        # Add short performance cues (kept within max_chars).
        s = self._enrich_instruct_for_emotion(s, max_chars=max_chars)
        return s, out_speed, out_pause_cfg
+
    async def _infer_instruct(
        self,
        text: str,
        detected_lang: str,
        chat_stream=None,
        chat_id: Optional[str] = None,
        style_name: str = "",
    ) -> str:
        """
        Infer a short CustomVoice `instruct` string from the target text via MaiBot's LLM interface.

        Results are memoized per (config signature, language, text hash) in a
        class-level cache. Returns "" when disabled or on any failure.
        """
        enabled = bool(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_ENABLED, False))
        if not enabled:
            return ""

        max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40)

        # Default prompt: output ONE short instruct line only.
        # NOTE(review): the "\\n" sequences below render as a literal
        # backslash-n in the prompt rather than a newline (elsewhere in this
        # file "\n" is used) — confirm whether real newlines were intended.
        default_tpl = (
            "你是配音导演。请根据要朗读的文本生成一行 TTS instruct。\\n"
            "硬性要求:必须同时包含【情绪】【语速】【停顿】三项。可以额外补充 1-2 个表演提示(如 音量/重音/音高/表现)。\\n"
            "只输出一行,不要解释,不要复述原文,不要引号/代码块。\\n"
            "输出格式固定为:情绪=<...>;语速=<...>;停顿=<...>\\n"
            "语速可选:很慢/稍慢/正常/稍快/很快。\\n"
            "停顿可选:很少/自然/稍多/很多。\\n"
            "长度<= {max_chars} 字。\\n"
            "文本语言: {lang}\\n"
            "待朗读文本: {text}\\n"
        )
        prompt_tpl = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_PROMPT, default_tpl) or "")
        if not prompt_tpl.strip():
            prompt_tpl = default_tpl

        # Cache key should change if prompt/base_tone/max_chars changes.
        base_raw = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or "")
        cfg_sig_src = f"{max_chars}\\n{prompt_tpl}\\n{base_raw}"
        cfg_sig = hashlib.sha256(cfg_sig_src.encode("utf-8")).hexdigest()[:12]
        text_sig = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        cache_key = f"{cfg_sig}:{detected_lang}:{text_sig}"
        cached = self._instruct_cache.get(cache_key)
        if cached:
            return cached

        lang = detected_lang or "auto"
        prompt = prompt_tpl.format(text=text.strip(), lang=lang, max_chars=max_chars)

        try:
            resp = await generator_api.generate_tts_instruct(
                prompt=prompt,
                request_type="tts_instruct",
            )
            # Sanitize the LLM output and prepend the configured base tone.
            instruct = self._clean_instruct(resp or "", max_chars=max_chars)
            instruct = self._attach_base_tone(instruct, max_chars=max_chars)
            if instruct:
                # Only successful, non-empty results are cached.
                self._instruct_cache[cache_key] = instruct
            return instruct
        except Exception as e:
            # Best-effort feature: log and fall back to no instruct.
            logger.warning(f"{self.log_prefix} auto_instruct 失败(style={style_name}): {e}")
            return ""
+
+ def validate_config(self) -> Tuple[bool, str]:
+ server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
+ if not server:
+ return False, "ComfyUI 未配置 server"
+
+ input_dir = self.get_config(
+ ConfigKeys.COMFYUI_INPUT_DIR,
+ "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
+ )
+ if not input_dir:
+ return False, "ComfyUI 未配置 input_dir"
+
+ styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
+ styles = self._normalize_styles_config(styles_raw)
+ if not styles:
+ return False, "ComfyUI 后端未配置任何风格(至少需要配置 1 个 style)"
+
+ default_name = self.get_default_voice() or "default"
+ if default_name not in styles:
+ # Fallback to "default" if present.
+ if "default" in styles:
+ default_name = "default"
+ else:
+ return False, f"ComfyUI default_style='{default_name}' 不存在"
+
+ st = styles.get(default_name, {})
+ mode = (st.get("mode") or "voice_clone").strip()
+ if mode == "voice_clone":
+ if not st.get("refer_wav") or not st.get("prompt_text"):
+ return False, f"ComfyUI 风格 '{default_name}' 配置不完整(voice_clone 需要 refer_wav 和 prompt_text)"
+ elif mode == "custom_voice":
+ if not st.get("model_path") or not st.get("speaker"):
+ return False, f"ComfyUI 风格 '{default_name}' 配置不完整(custom_voice 需要 model_path 和 speaker)"
+ else:
+ return False, f"ComfyUI 风格 '{default_name}' mode 无效: {mode}"
+
+ return True, ""
+
+ def _ensure_ref_in_input(self, input_dir: str, refer_wav: str) -> str:
+ refer_wav = TTSFileManager.resolve_path(refer_wav)
+ if not os.path.exists(refer_wav):
+ raise FileNotFoundError(f"参考音频不存在: {refer_wav}")
+
+ st = os.stat(refer_wav)
+ cache_key = f"{os.path.abspath(refer_wav)}:{st.st_mtime_ns}:{st.st_size}"
+ if cache_key in self._ref_cache:
+ name = self._ref_cache[cache_key]
+ if os.path.exists(os.path.join(input_dir, name)):
+ return name
+
+ ext = os.path.splitext(refer_wav)[1] or ".wav"
+ h = hashlib.sha256(cache_key.encode("utf-8")).hexdigest()[:16]
+ name = f"maibot_ref_{h}{ext}"
+ dst = os.path.join(input_dir, name)
+
+ os.makedirs(input_dir, exist_ok=True)
+ if not os.path.exists(dst):
+ # Keep it simple: copy file bytes. LoadAudio can decode common formats (wav/mp3).
+ import shutil
+
+ shutil.copyfile(refer_wav, dst)
+
+ self._ref_cache[cache_key] = name
+ return name
+
+ def _build_prompt_voice_clone(
+ self,
+ ref_filename: str,
+ ref_text: str,
+ target_text: str,
+ language: str,
+ model_choice: str,
+ precision: str,
+ seed: int,
+ max_new_tokens: int,
+ top_p: float,
+ top_k: int,
+ temperature: float,
+ repetition_penalty: float,
+ audio_quality: str,
+ mlx_python: str,
+ mlx_cli: str,
+ pause_cfg: Dict[str, float],
+ ) -> Dict[str, Any]:
+ # Node IDs are arbitrary but stable in this prompt template.
+ # 1: LoadAudio -> outputs AUDIO
+ # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG
+ # 3: MLX VoiceClone -> outputs AUDIO
+ # 4: SaveAudioMP3 -> outputs UI audio file info
+ filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+ prompt: Dict[str, Any] = {
+ "1": {
+ "class_type": "LoadAudio",
+ "inputs": {
+ "audio": ref_filename,
+ },
+ },
+ "2": {
+ "class_type": "FB_Qwen3TTSConfig",
+ "inputs": {
+ "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)),
+ "period_pause": float(pause_cfg.get("period_pause", 0.0)),
+ "comma_pause": float(pause_cfg.get("comma_pause", 0.0)),
+ "question_pause": float(pause_cfg.get("question_pause", 0.0)),
+ "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)),
+ },
+ },
+ "3": {
+ "class_type": "MLX_Qwen3TTSVoiceClone",
+ "inputs": {
+ "target_text": target_text,
+ "model_choice": model_choice,
+ "device": "auto",
+ "precision": precision,
+ "language": language,
+ "ref_audio": ["1", 0],
+ "ref_text": ref_text,
+ "seed": int(seed),
+ "max_new_tokens": int(max_new_tokens),
+ "top_p": float(top_p),
+ "top_k": int(top_k),
+ "temperature": float(temperature),
+ "repetition_penalty": float(repetition_penalty),
+ "attention": "auto",
+ "unload_model_after_generate": False,
+ "config": ["2", 0],
+ "mlx_python": mlx_python,
+ "mlx_cli": mlx_cli,
+ },
+ },
+ "4": {
+ "class_type": "SaveAudioMP3",
+ "inputs": {
+ "audio": ["3", 0],
+ "filename_prefix": filename_prefix,
+ "quality": audio_quality,
+ },
+ },
+ }
+ return prompt
+
+ def _build_prompt_custom_voice(
+ self,
+ target_text: str,
+ speaker: str,
+ model_path: str,
+ instruct: str,
+ speed: float,
+ language: str,
+ seed: int,
+ max_new_tokens: int,
+ top_p: float,
+ top_k: int,
+ temperature: float,
+ repetition_penalty: float,
+ audio_quality: str,
+ mlx_python: str,
+ mlx_cli: str,
+ pause_cfg: Dict[str, float],
+ ) -> Dict[str, Any]:
+ # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG
+ # 3: MLX CustomVoice -> outputs AUDIO
+ # 4: SaveAudioMP3 -> outputs UI audio file info
+ filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+ prompt: Dict[str, Any] = {
+ "2": {
+ "class_type": "FB_Qwen3TTSConfig",
+ "inputs": {
+ "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)),
+ "period_pause": float(pause_cfg.get("period_pause", 0.0)),
+ "comma_pause": float(pause_cfg.get("comma_pause", 0.0)),
+ "question_pause": float(pause_cfg.get("question_pause", 0.0)),
+ "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)),
+ },
+ },
+ "3": {
+ "class_type": "MLX_Qwen3TTSCustomVoice",
+ "inputs": {
+ "text": target_text,
+ "speaker": speaker,
+ "model_path": model_path,
+ "instruct": instruct or "",
+ "speed": float(speed),
+ "language": language,
+ "seed": int(seed),
+ "max_new_tokens": int(max_new_tokens),
+ "top_p": float(top_p),
+ "top_k": int(top_k),
+ "temperature": float(temperature),
+ "repetition_penalty": float(repetition_penalty),
+ "config": ["2", 0],
+ "mlx_python": mlx_python,
+ "mlx_cli": mlx_cli,
+ },
+ },
+ "4": {
+ "class_type": "SaveAudioMP3",
+ "inputs": {
+ "audio": ["3", 0],
+ "filename_prefix": filename_prefix,
+ "quality": audio_quality,
+ },
+ },
+ }
+ return prompt
+
+ async def _queue_and_wait(
+ self, server: str, prompt: Dict[str, Any], timeout: int
+ ) -> Dict[str, Any]:
+ session_manager = await TTSSessionManager.get_instance()
+ prompt_id = str(uuid.uuid4())
+
+ post_url = f"{server.rstrip('/')}/prompt"
+ payload = {
+ "prompt": prompt,
+ "client_id": "maibot-tts-voice-plugin",
+ "prompt_id": prompt_id,
+ }
+
+ async with session_manager.post(
+ post_url, json=payload, backend_name=self.backend_name, timeout=timeout
+ ) as resp:
+ data = await resp.json(content_type=None)
+ if resp.status != 200:
+ raise RuntimeError(f"ComfyUI /prompt 失败: {resp.status} {str(data)[:200]}")
+ if "error" in data:
+ raise RuntimeError(f"ComfyUI /prompt 返回错误: {data['error']}")
+
+ # Poll history until prompt_id appears
+ hist_url = f"{server.rstrip('/')}/history/{prompt_id}"
+ deadline = time.time() + float(timeout)
+ while time.time() < deadline:
+ async with session_manager.get(
+ hist_url, backend_name=self.backend_name, timeout=timeout
+ ) as resp:
+ history = await resp.json(content_type=None)
+ if prompt_id in history:
+ return history[prompt_id]
+ await asyncio.sleep(0.35)
+
+ raise TimeoutError("等待 ComfyUI 生成超时")
+
+ async def _download_output_audio(self, server: str, history_item: Dict[str, Any], timeout: int) -> bytes:
+ outputs = history_item.get("outputs") or {}
+ node_out = outputs.get("4") or {}
+ audios = node_out.get("audio") or []
+ if not audios:
+ # Some failures show up only in status/messages.
+ status = history_item.get("status") or {}
+ raise RuntimeError(f"ComfyUI 未产出音频. status={status}")
+
+ a0 = audios[0]
+ filename = a0.get("filename")
+ subfolder = a0.get("subfolder", "")
+ folder_type = a0.get("type", "output")
+ if not filename:
+ raise RuntimeError(f"ComfyUI 音频输出结构异常: {a0}")
+
+ q = urlencode({"filename": filename, "subfolder": subfolder, "type": folder_type})
+ url = f"{server.rstrip('/')}/view?{q}"
+
+ session_manager = await TTSSessionManager.get_instance()
+ async with session_manager.get(url, backend_name=self.backend_name, timeout=timeout) as resp:
+ if resp.status != 200:
+ txt = await resp.text()
+ raise RuntimeError(f"ComfyUI /view 失败: {resp.status} {txt[:200]}")
+ return await resp.read()
+
    async def execute(self, text: str, voice: Optional[str] = None, **kwargs) -> TTSResult:
        """
        Synthesize `text` through a ComfyUI workflow and send the resulting audio.

        Args:
            text: text to synthesize; must be non-empty.
            voice: style name. Falls back to the backend default, then "default",
                then the first configured style (sorted by name).
            **kwargs: may carry chat_stream / chat_id, forwarded to auto-instruct
                inference for custom_voice styles.

        Returns:
            TTSResult — failure results carry a human-readable error message.
        """
        is_valid, err = self.validate_config()
        if not is_valid:
            return TTSResult(False, err, backend_name=self.backend_name)

        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
        input_dir = self.get_config(
            ConfigKeys.COMFYUI_INPUT_DIR,
            "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
        )
        # Backend-specific timeout, falling back to the general one.
        timeout = int(self.get_config(ConfigKeys.COMFYUI_TIMEOUT, self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)))

        audio_quality = self.get_config(ConfigKeys.COMFYUI_AUDIO_QUALITY, "128k")
        # NOTE(review): these fallback paths are machine-specific; presumably real
        # deployments always set them in config — confirm.
        mlx_python = self.get_config(
            ConfigKeys.COMFYUI_MLX_PYTHON,
            "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python",
        )
        mlx_cli = self.get_config(
            ConfigKeys.COMFYUI_MLX_CLI,
            "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py",
        )

        # Restrict styles to those this backend is allowed to serve.
        styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
        styles = self._filter_styles_by_mode(self._normalize_styles_config(styles_raw))

        style_name = (voice or self.get_default_voice() or "").strip() or "default"
        if style_name not in styles:
            # For split backends (voiceclone/customvoice), make "wrong style" errors explicit.
            if (voice or "").strip() and self.allowed_modes:
                return TTSResult(
                    False,
                    f"ComfyUI风格 '{style_name}' 不存在或不属于当前后端({self.backend_name})",
                    backend_name=self.backend_name,
                )
            # Fallback order: "default" -> first available style.
            if "default" in styles:
                style_name = "default"
            elif styles:
                style_name = sorted(styles.keys())[0]
            else:
                return TTSResult(
                    False,
                    f"ComfyUI 未配置任何风格({self.backend_name})",
                    backend_name=self.backend_name,
                )
        style = styles.get(style_name, {})

        # Per-mode required fields must be present before building the prompt graph.
        mode = (style.get("mode") or "voice_clone").strip()
        if mode == "voice_clone":
            refer_wav = style.get("refer_wav", "")
            prompt_text = style.get("prompt_text", "")
            if not refer_wav or not prompt_text:
                return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(voice_clone)", backend_name=self.backend_name)
        elif mode == "custom_voice":
            model_path = style.get("model_path", "")
            speaker = style.get("speaker", "")
            if not model_path or not speaker:
                return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(custom_voice)", backend_name=self.backend_name)
        else:
            return TTSResult(False, f"ComfyUI风格 '{style_name}' mode 无效: {mode}", backend_name=self.backend_name)

        # Map language to the MLX node's language combo. Default to Auto.
        detected = TTSTextUtils.detect_language(text)
        language = style.get("language") or LANG_TO_DEMO.get(detected, "Auto")

        # Sampling defaults match the MLX node defaults we exposed.
        seed = int(style.get("seed", 0) or 0)
        model_choice = str(style.get("model_choice", "1.7B") or "1.7B")
        precision = str(style.get("precision", "bf16") or "bf16")
        max_new_tokens = int(style.get("max_new_tokens", 2048) or 2048)
        top_p = float(style.get("top_p", 0.8) or 0.8)
        top_k = int(style.get("top_k", 20) or 20)
        temperature = float(style.get("temperature", 1.0) or 1.0)
        repetition_penalty = float(style.get("repetition_penalty", 1.05) or 1.05)

        # Global pause configuration (seconds per punctuation class).
        pause_cfg = {
            "pause_linebreak": float(self.get_config(ConfigKeys.COMFYUI_PAUSE_LINEBREAK, 0.0)),
            "period_pause": float(self.get_config(ConfigKeys.COMFYUI_PERIOD_PAUSE, 0.0)),
            "comma_pause": float(self.get_config(ConfigKeys.COMFYUI_COMMA_PAUSE, 0.0)),
            "question_pause": float(self.get_config(ConfigKeys.COMFYUI_QUESTION_PAUSE, 0.0)),
            "hyphen_pause": float(self.get_config(ConfigKeys.COMFYUI_HYPHEN_PAUSE, 0.0)),
        }
        # Allow per-style override; non-numeric values are silently ignored.
        if isinstance(style.get("pause_cfg"), dict):
            for k in pause_cfg.keys():
                if k in style["pause_cfg"]:
                    try:
                        pause_cfg[k] = float(style["pause_cfg"][k])
                    except Exception:
                        pass

        try:
            if mode == "voice_clone":
                # Reference audio must live inside ComfyUI's input dir for LoadAudio.
                ref_filename = self._ensure_ref_in_input(input_dir, style.get("refer_wav", ""))
                prompt = self._build_prompt_voice_clone(
                    ref_filename=ref_filename,
                    ref_text=style.get("prompt_text", ""),
                    target_text=text,
                    language=language,
                    model_choice=model_choice,
                    precision=precision,
                    seed=seed,
                    max_new_tokens=max_new_tokens,
                    top_p=top_p,
                    top_k=top_k,
                    temperature=temperature,
                    repetition_penalty=repetition_penalty,
                    audio_quality=audio_quality,
                    mlx_python=mlx_python,
                    mlx_cli=mlx_cli,
                    pause_cfg=pause_cfg,
                )
            else:
                # Allow per-style / automatic instruct inference.
                # instruct == "__AUTO__" forces inference; an empty instruct plus
                # auto_instruct=True also triggers it.
                instruct = str(style.get("instruct", "")).strip()
                auto_style = bool(style.get("auto_instruct", False))
                inferred = ""
                if instruct == "__AUTO__" or (not instruct and auto_style):
                    chat_stream = kwargs.get("chat_stream")
                    chat_id = kwargs.get("chat_id")
                    inferred = await self._infer_instruct(
                        text=text,
                        detected_lang=detected,
                        chat_stream=chat_stream,
                        chat_id=chat_id,
                        style_name=style_name,
                    )
                    if inferred:
                        instruct = inferred

                # If the instruct contains usable fields, map them to real controls
                # (speed value and scaled pause config).
                max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40)
                instruct, mapped_speed, mapped_pause_cfg = self._apply_instruct_controls(
                    instruct=instruct,
                    speed=float(style.get("speed", 1.0) or 1.0),
                    pause_cfg=pause_cfg,
                    max_chars=max_chars,
                )

                prompt = self._build_prompt_custom_voice(
                    target_text=text,
                    speaker=str(style.get("speaker", "")).strip(),
                    model_path=str(style.get("model_path", "")).strip(),
                    instruct=instruct,
                    speed=mapped_speed,
                    language=language,
                    seed=seed,
                    max_new_tokens=max_new_tokens,
                    top_p=top_p,
                    top_k=top_k,
                    temperature=temperature,
                    repetition_penalty=repetition_penalty,
                    audio_quality=audio_quality,
                    mlx_python=mlx_python,
                    mlx_cli=mlx_cli,
                    pause_cfg=mapped_pause_cfg,
                )

            logger.info(f"{self.log_prefix} ComfyUI请求: text='{text[:50]}...', style={style_name}")
            history_item = await self._queue_and_wait(server, prompt, timeout=timeout)
            audio_bytes = await self._download_output_audio(server, history_item, timeout=timeout)

            ok, msg = TTSFileManager.validate_audio_data(audio_bytes)
            if not ok:
                return TTSResult(False, f"ComfyUI 返回音频无效: {msg}", backend_name=self.backend_name)

            return await self.send_audio(
                audio_data=audio_bytes,
                audio_format="mp3",
                prefix="tts_comfyui",
                voice_info=f"style: {style_name}",
            )
        except Exception as e:
            # Broad catch by design: any backend failure is reported as a TTSResult.
            return TTSResult(False, f"ComfyUI后端错误: {e}", backend_name=self.backend_name)
+
+
class ComfyUIVoiceCloneBackend(ComfyUIBackend):
    """ComfyUI backend restricted to voice_clone styles."""

    backend_name = "comfyui_voiceclone"
    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone 专用)"
    allowed_modes = {"voice_clone"}

    def get_default_voice(self) -> str:
        """Return the voiceclone-specific default style, else the shared default."""
        configured = (self.get_config(ConfigKeys.COMFYUI_VOICECLONE_DEFAULT_STYLE, "") or "").strip()
        return configured if configured else super().get_default_voice()
+
+
class ComfyUICustomVoiceBackend(ComfyUIBackend):
    """ComfyUI backend restricted to custom_voice styles."""

    backend_name = "comfyui_customvoice"
    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS CustomVoice 专用)"
    allowed_modes = {"custom_voice"}

    def get_default_voice(self) -> str:
        """Return the customvoice-specific default style, else the shared default."""
        configured = (self.get_config(ConfigKeys.COMFYUI_CUSTOMVOICE_DEFAULT_STYLE, "") or "").strip()
        return configured if configured else super().get_default_voice()
diff --git a/plugins/tts_voice_plugin/backends/cosyvoice.py b/plugins/tts_voice_plugin/backends/cosyvoice.py
new file mode 100644
index 00000000..28199815
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/cosyvoice.py
@@ -0,0 +1,285 @@
+"""
+CosyVoice后端实现
+使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
+"""
+
+import asyncio
+import os
+import shutil
+from typing import Optional, Tuple
+from .base import TTSBackendBase, TTSResult
+from ..utils.file import TTSFileManager
+from ..config_keys import ConfigKeys
+from src.common.logger import get_logger
+
+logger = get_logger("tts_cosyvoice")
+
# CosyVoice instruct map: keyword -> natural-language control prompt
# (dialects, emotion, speaking rate, volume, special styles).
# Keys are the Chinese keywords callers pass as `emotion`; values are the raw
# prompts sent to the model and must keep the trailing <|endofprompt|> marker.
COSYVOICE_INSTRUCT_MAP = {
    # Dialects
    "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
    "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>",
    "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>",
    "贵州话": "You are a helpful assistant. 请用贵州话表达。<|endofprompt|>",
    "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>",
    "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>",
    "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>",
    "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>",
    "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>",
    "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>",
    "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>",
    "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>",
    "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>",
    "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>",
    "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>",
    "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>",
    "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>",

    # Volume
    "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>",
    "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>",

    # Speaking rate
    "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>",
    "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",

    # Emotions
    "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
    "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>",
    "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>",

    # Special styles
    "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>",
    "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>",
}
+
+
class CosyVoiceBackend(TTSBackendBase):
    """
    CosyVoice speech backend.

    Synthesizes speech through the Fun-CosyVoice3-0.5B Gradio API hosted on
    ModelScope. Supports 3-second rapid voice cloning and natural-language
    control (dialect, emotion, speaking rate, ...).
    """

    backend_name = "cosyvoice"
    backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)"
    support_private_chat = True
    default_audio_format = "wav"

    def get_default_voice(self) -> str:
        """Return the default voice (CosyVoice needs no preset voice, so empty)."""
        return ""

    def validate_config(self) -> Tuple[bool, str]:
        """Validate required configuration; returns (ok, error_message)."""
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")

        if not gradio_url:
            return False, "CosyVoice后端缺少必需的 gradio_url 配置"

        return True, ""

    def _resolve_instruct(self, emotion: Optional[str]) -> str:
        """
        Resolve an emotion/dialect keyword into a CosyVoice instruct prompt.

        Args:
            emotion: keyword looked up in COSYVOICE_INSTRUCT_MAP.

        Returns:
            The instruct text; never empty (falls back to a Cantonese default).
        """
        if emotion and emotion in COSYVOICE_INSTRUCT_MAP:
            return COSYVOICE_INSTRUCT_MAP[emotion]

        # Configured default instruct (must not end up empty).
        default_instruct = self.get_config(
            ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT,
            "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
        )

        # If the configured value is blank, force the Cantonese default.
        if not default_instruct or not default_instruct.strip():
            default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"

        return default_instruct

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        Run CosyVoice synthesis via the Gradio endpoint.

        Args:
            text: text to synthesize.
            voice: for CosyVoice this is a path to a reference-audio file,
                overriding the configured reference audio when it exists.
            emotion: emotion / dialect / speed keyword.

        Returns:
            TTSResult
        """
        # Validate configuration first.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # Reject empty text.
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # Read configuration.
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")
        mode_config = self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, "3s极速复刻")

        # mode_checkbox_group is actually a Radio widget that expects a plain
        # string; the config value may be either a string or a list.
        if isinstance(mode_config, list):
            mode_str = mode_config[0] if mode_config else "3s极速复刻"
        else:
            mode_str = mode_config if mode_config else "3s极速复刻"

        timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60)
        reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "")
        prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "")

        # CosyVoice's "natural language control" mode still requires a reference
        # audio and its transcript; fall back to the bundled test.wav if unset.
        if not reference_audio or not os.path.exists(reference_audio):
            plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            default_audio = os.path.join(plugin_dir, "test.wav")
            if os.path.exists(default_audio):
                reference_audio = default_audio
                logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}")

        # Default transcript when none is configured.
        if not prompt_text:
            prompt_text = "大家好,我是嘉然,今天我来为大家朗读。"
            logger.debug(f"{self.log_prefix} 使用默认 prompt_text")

        # The voice argument may override the configured reference audio.
        if voice and os.path.exists(voice):
            reference_audio = voice

        # Resolve the instruct prompt (never empty).
        instruct_text = self._resolve_instruct(emotion)

        logger.info(
            f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' "
            f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}"
        )

        try:
            # Import gradio_client lazily so it is only required when this
            # backend is actually used.
            try:
                from gradio_client import Client, handle_file
            except ImportError:
                logger.error(f"{self.log_prefix} gradio_client 未安装,请运行: pip install gradio_client")
                return TTSResult(
                    False,
                    "gradio_client 未安装,请运行: pip install gradio_client",
                    backend_name=self.backend_name
                )

            # Create the Gradio client with explicit httpx timeouts when possible.
            try:
                import httpx
                httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)}
                client = Client(gradio_url, httpx_kwargs=httpx_kwargs)
            except Exception as e:
                logger.warning(f"{self.log_prefix} 无法设置 httpx 超时,使用默认配置: {e}")
                client = Client(gradio_url)

            # Prepare the reference-audio upload handle.
            logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}")
            prompt_wav_upload = handle_file(reference_audio) if reference_audio and os.path.exists(reference_audio) else None
            logger.debug(f"{self.log_prefix} 参考音频准备完成")

            # Invoke the API off the event loop (predict is blocking).
            logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)")
            logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})")
            logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...")
            logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...")

            result = await asyncio.wait_for(
                asyncio.to_thread(
                    client.predict,
                    tts_text=text,
                    mode_checkbox_group=mode_str,
                    prompt_text=prompt_text,
                    prompt_wav_upload=prompt_wav_upload,
                    prompt_wav_record=None,
                    instruct_text=instruct_text,
                    seed=0,
                    stream=False,  # the API expects the boolean False despite the documented Literal['False']
                    api_name="/generate_audio"
                ),
                timeout=timeout
            )

            logger.info(f"{self.log_prefix} CosyVoice API 响应成功")

            # `result` is the path of the generated audio file.
            if not result or not os.path.exists(result):
                return TTSResult(
                    False,
                    f"CosyVoice 生成失败,未返回有效文件: {result}",
                    backend_name=self.backend_name
                )

            # Read the audio bytes from disk.
            try:
                with open(result, 'rb') as f:
                    audio_data = f.read()
            except Exception as e:
                logger.error(f"{self.log_prefix} 读取音频文件失败: {e}")
                return TTSResult(
                    False,
                    f"读取音频文件失败: {e}",
                    backend_name=self.backend_name
                )

            # Validate the audio payload.
            is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
            if not is_valid:
                logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}")
                return TTSResult(
                    False,
                    f"CosyVoice语音{error_msg}",
                    backend_name=self.backend_name
                )

            logger.debug(
                f"{self.log_prefix} CosyVoice音频数据验证通过 "
                f"(大小: {len(audio_data)}字节)"
            )

            # Hand off through the shared send path.
            audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav")
            voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}"

            return await self.send_audio(
                audio_data=audio_data,
                audio_format=audio_format,
                prefix="tts_cosyvoice",
                voice_info=voice_info
            )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)")
            return TTSResult(
                False,
                "CosyVoice API 调用超时",
                backend_name=self.backend_name
            )
        except Exception as e:
            # Broad catch by design: report any failure as a TTSResult.
            logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}")
            return TTSResult(
                False,
                f"CosyVoice 执行错误: {e}",
                backend_name=self.backend_name
            )
diff --git a/plugins/tts_voice_plugin/backends/doubao.py b/plugins/tts_voice_plugin/backends/doubao.py
new file mode 100644
index 00000000..4c566abf
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/doubao.py
@@ -0,0 +1,230 @@
+"""
+豆包语音后端实现
+使用字节跳动豆包语音 API 进行语音合成
+"""
+
+import asyncio
+import uuid
+from typing import Optional, List, Dict, Tuple
+from .base import TTSBackendBase, TTSResult
+from .doubao_stream_parser import DoubaoStreamParser
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..config_keys import ConfigKeys
+from src.common.logger import get_logger
+
+logger = get_logger("tts_doubao")
+
# Doubao emotion map (used to auto-generate context_texts).
# Keys are the Chinese keywords callers pass as `emotion`; values are the raw
# instruction sentences forwarded to the API and must not be translated.
DOUBAO_EMOTION_MAP = {
    # Positive emotions
    "开心": "你的语气再欢乐一点",
    "兴奋": "用特别兴奋激动的语气说话",
    "温柔": "用温柔体贴的语气说话",
    "骄傲": "用骄傲的语气说话",
    "自信": "用自信坚定的语气说话",

    # Negative emotions
    "生气": "你得跟我互怼!就是跟我用吵架的语气对话",
    "愤怒": "用愤怒的语气说话",
    "伤心": "用特别特别痛心的语气说话",
    "失望": "用失望沮丧的语气说话",
    "委屈": "用委屈的语气说话",

    # Neutral emotions
    "平静": "用平静淡定的语气说话",
    "严肃": "用严肃认真的语气说话",
    "疑惑": "用疑惑不解的语气说话",

    # Speaking-rate adjustments
    "慢速": "说慢一点",
    "快速": "说快一点",

    # Volume adjustments
    "小声": "你嗓门再小点",
    "大声": "大声一点",
}
+
+
class DoubaoBackend(TTSBackendBase):
    """
    Doubao speech backend.

    High-quality synthesis through ByteDance's Doubao speech API.
    Supports both preset voices and cloned voices.
    """

    backend_name = "doubao"
    backend_description = "字节跳动豆包语音API"
    support_private_chat = True
    default_audio_format = "mp3"

    def get_default_voice(self) -> str:
        """Return the configured default voice id."""
        return self.get_config(ConfigKeys.DOUBAO_DEFAULT_VOICE, "zh_female_shuangkuaisisi_moon_bigtts")

    def validate_config(self) -> Tuple[bool, str]:
        """Validate required credentials; returns (ok, error_message)."""
        app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "")
        access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "")
        resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "")

        if not app_id or not access_key or not resource_id:
            return False, "豆包语音后端缺少必需的认证配置(app_id/access_key/resource_id)"

        return True, ""

    def _resolve_emotion(self, emotion: Optional[str]) -> Optional[List[str]]:
        """
        Resolve an emotion keyword into the API's context_texts list.

        Args:
            emotion: keyword looked up in DOUBAO_EMOTION_MAP.

        Returns:
            A single-element context_texts list, or None when unmapped.
        """
        if emotion and emotion in DOUBAO_EMOTION_MAP:
            return [DOUBAO_EMOTION_MAP[emotion]]
        return None

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        Run Doubao speech synthesis.

        Args:
            text: text to synthesize.
            voice: voice id; falls back to the configured default.
            emotion: emotion/tone keyword mapped to context_texts.

        Returns:
            TTSResult
        """
        # Validate configuration first.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # Reject empty text.
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # Read configuration.
        api_url = self.get_config(ConfigKeys.DOUBAO_API_URL, "https://openspeech.bytedance.com/api/v3/tts/unidirectional")
        app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "")
        access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "")
        resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "")
        timeout = self.get_config(ConfigKeys.DOUBAO_TIMEOUT, 30)

        if not voice:
            voice = self.get_default_voice()

        # Build request headers (X-Api-* auth scheme; fresh request id per call).
        headers = {
            "Content-Type": "application/json",
            "X-Api-App-Id": app_id,
            "X-Api-Access-Key": access_key,
            "X-Api-Resource-Id": resource_id,
            "X-Api-Request-Id": str(uuid.uuid4()),
            "Accept-Encoding": "gzip, deflate"
        }

        # Build the request body.
        # NOTE(review): annotation should be typing.Any (needs import), not the
        # builtin `any`; harmless at runtime for a local annotation.
        request_data: Dict[str, any] = {
            "req_params": {
                "text": text,
                "speaker": voice,
                "audio_params": {
                    "format": self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3"),
                    "sample_rate": self.get_config(ConfigKeys.DOUBAO_SAMPLE_RATE, 24000),
                    "bitrate": self.get_config(ConfigKeys.DOUBAO_BITRATE, 128000)
                }
            }
        }

        # Optional parameters are only sent when explicitly configured.
        speed = self.get_config(ConfigKeys.DOUBAO_SPEED, None)
        if speed is not None:
            request_data["req_params"]["speed"] = speed

        volume = self.get_config(ConfigKeys.DOUBAO_VOLUME, None)
        if volume is not None:
            request_data["req_params"]["volume"] = volume

        # Resolve context_texts.
        context_texts: Optional[List[str]] = None

        # The explicit emotion argument takes priority...
        if emotion:
            context_texts = self._resolve_emotion(emotion)
            if context_texts:
                logger.info(f"{self.log_prefix} 使用emotion参数: {emotion} -> {context_texts[0]}")

        # ...otherwise fall back to the configured default.
        if not context_texts:
            context_texts = self.get_config(ConfigKeys.DOUBAO_CONTEXT_TEXTS, None)

        if context_texts:
            request_data["req_params"]["context_texts"] = context_texts

        logger.info(f"{self.log_prefix} 豆包语音请求: text='{text[:50]}...' (共{len(text)}字符), voice={voice}")

        try:
            session_manager = await TTSSessionManager.get_instance()
            async with session_manager.post(
                api_url,
                json=request_data,
                headers=headers,
                backend_name="doubao",
                timeout=timeout
            ) as response:
                logger.info(f"{self.log_prefix} 豆包API响应状态码: {response.status}")

                if response.status == 200:
                    # Parse the streaming JSON-lines response body.
                    audio_data, error_msg = await DoubaoStreamParser.parse_response(
                        response,
                        log_prefix=self.log_prefix
                    )

                    if error_msg:
                        logger.error(f"{self.log_prefix} 豆包语音解析失败: {error_msg}")
                        return TTSResult(False, error_msg, backend_name=self.backend_name)

                    # Validate the assembled audio payload.
                    is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
                    if not is_valid:
                        logger.warning(f"{self.log_prefix} 豆包音频数据验证失败: {error_msg}")
                        return TTSResult(False, f"豆包语音{error_msg}", backend_name=self.backend_name)

                    logger.debug(f"{self.log_prefix} 豆包音频数据验证通过 (大小: {len(audio_data)}字节)")

                    # Hand off through the shared send path.
                    audio_format = self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3")
                    return await self.send_audio(
                        audio_data=audio_data,
                        audio_format=audio_format,
                        prefix="tts_doubao",
                        voice_info=f"音色: {voice}"
                    )
                else:
                    error_text = await response.text()
                    logger.error(f"{self.log_prefix} 豆包API请求失败[{response.status}]: {error_text[:200]}")
                    return TTSResult(
                        False,
                        f"豆包语音API调用失败: {response.status} - {error_text[:100]}",
                        backend_name=self.backend_name
                    )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} 豆包API请求超时 (配置超时: {timeout}秒)")
            return TTSResult(False, "豆包语音API调用超时", backend_name=self.backend_name)
        except Exception as e:
            # Broad catch by design: report any failure as a TTSResult.
            logger.error(f"{self.log_prefix} 豆包语音执行异常: {e}")
            return TTSResult(False, f"豆包语音执行错误: {e}", backend_name=self.backend_name)
diff --git a/plugins/tts_voice_plugin/backends/doubao_stream_parser.py b/plugins/tts_voice_plugin/backends/doubao_stream_parser.py
new file mode 100644
index 00000000..a3f61925
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/doubao_stream_parser.py
@@ -0,0 +1,432 @@
+"""
+豆包语音流式响应解析器
+基于官方示例实现,确保兼容性和正确性
+
+官方API说明:
+- code=0: 继续处理,可能包含 "data"(音频)或 "sentence"(文本)
+- code=20000000: 结束标志,可能包含 "usage"(用量统计)
+- code>0: 错误响应
+"""
+
+import json
+import base64
+from typing import Tuple, Optional, List
+from src.common.logger import get_logger
+
+logger = get_logger("doubao_stream_parser")
+
+
+class DoubaoStreamParser:
+ """
+ 豆包语音流式响应解析器
+
+ 基于官方API实现,忠实还原官方示例逻辑。
+ 处理流程:
+ 1. 逐行读取 JSON 响应
+ 2. 检查状态码:code=0(继续), code=20000000(结束), code>0(错误)
+ 3. 提取音频数据(code=0 且有 "data" 字段)
+ 4. 记录日志(code=0 且有 "sentence" 字段)
+ """
+
+ def __init__(self, log_prefix: str = "[DoubaoParser]"):
+ """
+ 初始化解析器
+
+ Args:
+ log_prefix: 日志前缀
+ """
+ self.log_prefix = log_prefix
+ self._audio_chunks: List[bytes] = []
+ self._buffer: bytes = b''
+ self._line_count: int = 0
+ self._total_bytes: int = 0
+ self._error_message: Optional[str] = None
+ self._finished: bool = False # 是否收到结束信号
+ self._usage_info: Optional[dict] = None
+
+ def _decode_audio_from_base64(self, audio_base64: str) -> Optional[bytes]:
+ """
+ 从 Base64 字符串解码音频数据
+
+ 官方示例中直接使用 base64.b64decode(data["data"]),
+ 但我们添加了额外的容错和验证。
+
+ Args:
+ audio_base64: Base64 编码的音频数据
+
+ Returns:
+ 解码后的音频字节数据或 None
+ """
+ if not audio_base64:
+ return None
+
+ try:
+ # 官方示例直接调用 base64.b64decode()
+ # 这里添加容错处理:补充填充符(如果需要)
+ padding_needed = len(audio_base64) % 4
+ if padding_needed:
+ audio_base64 += '=' * (4 - padding_needed)
+ logger.debug(
+ f"{self.log_prefix} Base64填充已应用 "
+ f"(原长: {len(audio_base64) - (4 - padding_needed)}, 新长: {len(audio_base64)})"
+ )
+
+ audio_bytes = base64.b64decode(audio_base64)
+
+ if not audio_bytes:
+ logger.warning(f"{self.log_prefix} Base64解码结果为空")
+ return None
+
+ logger.debug(
+ f"{self.log_prefix} 音频块解码成功 - 大小: {len(audio_bytes)}字节"
+ )
+ return audio_bytes
+
+ except Exception as e:
+ logger.error(
+ f"{self.log_prefix} Base64解码失败: {e} "
+ f"(Base64长度: {len(audio_base64)})"
+ )
+ return None
+
+ def _process_json_line(self, line_str: str) -> Optional[str]:
+ """
+ 处理单行 JSON 数据
+
+ 严格按照官方示例逻辑:
+ 1. 检查 code 字段
+ 2. code=0 且有 data → 提取音频
+ 3. code=0 且有 sentence → 记录文本(可选)
+ 4. code=20000000 → 收到结束信号
+ 5. code>0 → 错误
+
+ Args:
+ line_str: JSON 字符串
+
+ Returns:
+ 如果收到结束信号,返回 "END";如果发生错误,返回错误信息;否则返回 None
+ """
+ try:
+ json_obj = json.loads(line_str)
+ except json.JSONDecodeError as e:
+ logger.debug(f"{self.log_prefix} JSON解析失败: {e}")
+ return None
+ except Exception as e:
+ logger.warning(f"{self.log_prefix} JSON处理异常: {e}")
+ return None
+
+ if not isinstance(json_obj, dict):
+ logger.debug(
+ f"{self.log_prefix} 收到非字典JSON对象: {type(json_obj).__name__}"
+ )
+ return None
+
+ code = json_obj.get("code", -1)
+
+ # ✅ 官方逻辑:处理 code=0 的数据帧
+ if code == 0:
+ # 检查是否有音频数据
+ if "data" in json_obj and json_obj["data"]:
+ chunk_audio = self._decode_audio_from_base64(json_obj["data"])
+ if chunk_audio:
+ self._audio_chunks.append(chunk_audio)
+ logger.debug(
+ f"{self.log_prefix} 音频块#{len(self._audio_chunks)} 已接收 "
+ f"(大小: {len(chunk_audio)}字节)"
+ )
+
+ # 检查是否有文本/句子信息(可选)
+ if "sentence" in json_obj and json_obj["sentence"]:
+ sentence_data = json_obj.get("sentence", {})
+ logger.debug(
+ f"{self.log_prefix} 收到句子数据: {sentence_data}"
+ )
+
+ return None # 继续处理
+
+ # ✅ 官方逻辑:处理 code=20000000 的结束帧
+ elif code == 20000000:
+ logger.info(f"{self.log_prefix} 收到流结束信号 (code=20000000)")
+
+ # 记录用量信息(如果有)
+ if "usage" in json_obj:
+ self._usage_info = json_obj["usage"]
+ logger.info(
+ f"{self.log_prefix} 豆包用量信息: {self._usage_info}"
+ )
+
+ self._finished = True
+ return "END" # 表示流已结束
+
+ # ✅ 官方逻辑:错误处理
+ elif code and code > 0:
+ error_msg = json_obj.get("message", f"未知错误 (code={code})")
+ logger.error(
+ f"{self.log_prefix} 豆包语音API返回错误 "
+ f"(code={code}): {error_msg}"
+ )
+ self._error_message = error_msg
+ return error_msg # 返回错误信息
+
+ # 未知状态码
+ else:
+ logger.debug(
+ f"{self.log_prefix} 收到未知状态码: code={code}"
+ )
+ return None
+
+ def _find_data_chunk_offset(self, header: bytes) -> int:
+ """
+ 在 WAV header 中查找 'data' 块的位置
+
+ 豆包返回的 WAV 可能包含额外的元数据块(如 LIST/INFO),
+ 导致 'data' 块不在标准的 44 字节位置。
+
+ Args:
+ header: WAV 文件头部数据
+
+ Returns:
+ data 块数据开始的位置(即 'data' + 4字节大小之后)
+ """
+ pos = 12 # 跳过 RIFF(4) + size(4) + WAVE(4)
+
+ while pos < len(header) - 8:
+ chunk_id = header[pos:pos+4]
+ chunk_size = int.from_bytes(header[pos+4:pos+8], 'little')
+
+ if chunk_id == b'data':
+ return pos + 8 # 返回音频数据开始位置
+
+ # 移动到下一个块
+ pos += 8 + chunk_size
+ # WAV 块需要对齐到偶数字节
+ if chunk_size % 2 == 1:
+ pos += 1
+
+ # 未找到 data 块,返回默认值
+ return 44
+
+ def _merge_audio_chunks(self, chunks: List[bytes]) -> bytes:
+ """
+ 合并音频块,处理 WAV 格式的流式响应
+
+ 豆包流式 WAV 响应特点:
+ 1. 第一个块包含完整 header(可能 > 44 字节,含 LIST/INFO 元数据)
+ 2. header 中的大小字段是 0xFFFFFFFF(流式占位符)
+ 3. 后续块是纯音频数据(无 header)
+ 4. 需要在合并后修正大小字段
+
+ Args:
+ chunks: 音频数据块列表
+
+ Returns:
+ 合并后的有效 WAV 文件
+ """
+ if not chunks:
+ return b''
+
+ first_chunk = chunks[0]
+
+ # 检查是否是 WAV 格式(RIFF header)
+ if len(first_chunk) < 44 or first_chunk[:4] != b'RIFF':
+ # 不是 WAV 格式(如 MP3),直接拼接
+ return b''.join(chunks)
+
+ # 查找 data 块的实际位置
+ data_offset = self._find_data_chunk_offset(first_chunk)
+ logger.debug(f"{self.log_prefix} WAV data 块偏移: {data_offset} 字节")
+
+ # 提取 header 和第一块的音频数据
+ header = bytearray(first_chunk[:data_offset])
+ data_parts = [first_chunk[data_offset:]]
+ skipped_headers = 0
+
+ # 处理后续块
+ for chunk in chunks[1:]:
+ if len(chunk) > 44 and chunk[:4] == b'RIFF':
+ # 后续块也有 RIFF header,需要跳过
+ chunk_data_offset = self._find_data_chunk_offset(chunk)
+ data_parts.append(chunk[chunk_data_offset:])
+ skipped_headers += 1
+ else:
+ # 纯音频数据
+ data_parts.append(chunk)
+
+ # 合并所有音频数据
+ audio_data = b''.join(data_parts)
+ audio_size = len(audio_data)
+
+ # 修正 WAV header 中的大小字段
+ # 字节 4-7: 文件总大小 - 8 = (header_size - 8) + audio_size
+ file_size = len(header) - 8 + audio_size
+ header[4:8] = file_size.to_bytes(4, 'little')
+
+ # 修正 data 块的大小字段(位于 data_offset - 4 处)
+ header[data_offset-4:data_offset] = audio_size.to_bytes(4, 'little')
+
+ if skipped_headers > 0 or audio_size > 0:
+ logger.info(
+ f"{self.log_prefix} WAV 流式合并完成: "
+ f"header={len(header)}字节, 音频={audio_size}字节, "
+ f"跳过重复header={skipped_headers}"
+ )
+
+ return bytes(header) + audio_data
+
+ def feed_chunk(self, chunk: bytes) -> Optional[str]:
+ """
+ 输入一块数据
+
+ Args:
+ chunk: 网络数据块
+
+ Returns:
+            遇到错误时返回错误信息;正常结束(收到END信号)或需要更多数据时返回 None
+ """
+ if not chunk:
+ return None
+
+ self._buffer += chunk
+ self._total_bytes += len(chunk)
+
+ # 按行处理(官方示例使用 iter_lines)
+ while b'\n' in self._buffer:
+ line_bytes, self._buffer = self._buffer.split(b'\n', 1)
+
+ # 尝试解码行数据
+ try:
+ line_str = line_bytes.decode('utf-8', errors='replace').strip()
+ except Exception as e:
+ logger.warning(
+ f"{self.log_prefix} 行解码失败: {e}, 跳过该行"
+ )
+ self._line_count += 1
+ continue
+
+ if not line_str:
+ continue
+
+ self._line_count += 1
+
+ # 处理该行
+ result = self._process_json_line(line_str)
+
+ # 如果收到结束信号或错误,立即返回
+ if result == "END":
+ return None # 正常结束
+ elif result: # 返回的是错误信息
+ return result
+
+ return None
+
+ def finalize(self) -> Tuple[Optional[bytes], Optional[str]]:
+ """
+ 完成解析,处理剩余数据
+
+ Returns:
+ (audio_data, error_message)
+ - audio_data: 合并后的音频数据(成功时)
+ - error_message: 错误信息(失败时)
+ """
+ # 处理剩余的 buffer 中的最后一行
+ if self._buffer.strip():
+ try:
+ line_str = self._buffer.decode('utf-8', errors='replace').strip()
+ if line_str:
+ logger.debug(
+ f"{self.log_prefix} 处理最后的buffer数据 "
+ f"(长度: {len(line_str)}字符)"
+ )
+ result = self._process_json_line(line_str)
+ if result and result != "END":
+ # 最后的 buffer 包含错误
+ self._error_message = result
+ except Exception as e:
+ logger.warning(
+ f"{self.log_prefix} 最后buffer解析异常: {e}"
+ )
+
+ logger.info(
+ f"{self.log_prefix} 豆包流解析完成 - "
+ f"处理行数: {self._line_count}, "
+ f"音频块数: {len(self._audio_chunks)}, "
+ f"接收字节数: {self._total_bytes}, "
+ f"正常结束: {self._finished}"
+ )
+
+ # 检查是否有错误
+ if self._error_message:
+ logger.error(
+ f"{self.log_prefix} 豆包API返回错误: {self._error_message}"
+ )
+ return None, f"豆包语音API错误: {self._error_message}"
+
+ # 检查是否有音频数据
+ if not self._audio_chunks:
+ if self._total_bytes == 0:
+ logger.warning(
+ f"{self.log_prefix} 豆包API未返回任何数据"
+ )
+ return None, "未收到任何响应数据"
+
+ logger.warning(
+ f"{self.log_prefix} 收到 {self._total_bytes} 字节数据但无音频块"
+ )
+ return None, "豆包语音未返回任何音频数据"
+
+ # ✅ 额外的数据完整性检查
+ # 过滤掉过小的块(可能是损坏或无效的)
+ min_chunk_size = 50 # 最小块大小
+ valid_chunks = [
+ chunk for chunk in self._audio_chunks
+ if len(chunk) >= min_chunk_size
+ ]
+
+ if not valid_chunks:
+ logger.error(
+ f"{self.log_prefix} 所有音频块都太小 (可能是损坏的数据)"
+ )
+ logger.debug(
+ f"{self.log_prefix} 块大小分布: {[len(c) for c in self._audio_chunks]}"
+ )
+ return None, "音频数据不完整或已损坏"
+
+ # 合并所有有效的音频数据(处理 WAV 多 header 问题)
+ merged_audio = self._merge_audio_chunks(valid_chunks)
+
+ logger.info(
+ f"{self.log_prefix} 音频合并完成 - "
+ f"有效块数: {len(valid_chunks)}/{len(self._audio_chunks)}, "
+ f"总大小: {len(merged_audio)}字节"
+ )
+
+ return merged_audio, None
+
+ @classmethod
+ async def parse_response(
+ cls,
+ response,
+ log_prefix: str = "[DoubaoParser]"
+ ) -> Tuple[Optional[bytes], Optional[str]]:
+ """
+ 解析豆包 API 的流式响应
+
+ Args:
+ response: aiohttp 响应对象
+ log_prefix: 日志前缀
+
+ Returns:
+ (audio_data, error_message)
+ """
+ parser = cls(log_prefix)
+
+ # 逐块读取响应流
+ async for chunk in response.content.iter_any():
+ result = parser.feed_chunk(chunk)
+
+ # 如果遇到错误,立即返回
+ if result and result != "END":
+ return None, result
+
+ # 完成解析,处理剩余数据
+ return parser.finalize()
diff --git a/plugins/tts_voice_plugin/backends/gpt_sovits.py b/plugins/tts_voice_plugin/backends/gpt_sovits.py
new file mode 100644
index 00000000..126851ff
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/gpt_sovits.py
@@ -0,0 +1,326 @@
+"""
+GPT-SoVITS 后端实现
+使用本地 GPT-SoVITS 服务进行语音合成
+"""
+
+import asyncio
+from typing import Optional, Dict, Any, Tuple, ClassVar
+from .base import TTSBackendBase, TTSResult
+from ..utils.text import TTSTextUtils
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..config_keys import ConfigKeys
+from src.common.logger import get_logger
+
+logger = get_logger("tts_gpt_sovits")
+
+
+class GPTSoVITSBackend(TTSBackendBase):
+ """
+ GPT-SoVITS 后端
+
+ 使用本地 GPT-SoVITS 服务进行高度定制化的语音合成
+ 支持动态切换 GPT 和 SoVITS 模型权重
+ """
+
+ backend_name = "gpt_sovits"
+ backend_description = "本地GPT-SoVITS服务"
+ support_private_chat = True
+ default_audio_format = "mp3"
+
+ # 类变量:记录当前加载的模型路径,避免重复切换
+ _current_gpt_weights: ClassVar[Optional[str]] = None
+ _current_sovits_weights: ClassVar[Optional[str]] = None
+
+ def get_default_voice(self) -> str:
+ """获取默认风格"""
+ return "default"
+
+ async def _switch_model(
+ self,
+ server: str,
+ gpt_weights: Optional[str],
+ sovits_weights: Optional[str],
+ timeout: int
+ ) -> Tuple[bool, str]:
+ """
+ 切换 GPT-SoVITS 模型权重
+
+ Args:
+ server: 服务器地址
+ gpt_weights: GPT 模型权重路径
+ sovits_weights: SoVITS 模型权重路径
+ timeout: 超时时间
+
+ Returns:
+ (success, error_message)
+ """
+ session_manager = await TTSSessionManager.get_instance()
+
+ async def _set_model_v1() -> Tuple[bool, str]:
+ # 兼容旧版 api.py: 仅支持 /set_model 同时切换
+ if not gpt_weights or not sovits_weights:
+ return False, "当前GPT-SoVITS服务不支持单独切换模型(请同时配置GPT与SoVITS权重)"
+ set_model_url = (
+ f"{server.rstrip('/')}/set_model?"
+ f"gpt_model_path={gpt_weights}&sovits_model_path={sovits_weights}"
+ )
+ logger.info(f"{self.log_prefix} 切换模型(兼容模式): {gpt_weights} | {sovits_weights}")
+ try:
+ async with session_manager.get(
+ set_model_url,
+ backend_name="gpt_sovits",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ GPTSoVITSBackend._current_gpt_weights = gpt_weights
+ GPTSoVITSBackend._current_sovits_weights = sovits_weights
+ logger.info(f"{self.log_prefix} 模型切换成功(兼容模式)")
+ return True, ""
+ error_text = await response.text()
+ return False, f"模型切换失败: {error_text}"
+ except Exception as e:
+ return False, f"模型切换异常: {e}"
+
+ # 切换 GPT 权重
+ if gpt_weights and gpt_weights != GPTSoVITSBackend._current_gpt_weights:
+ gpt_url = f"{server.rstrip('/')}/set_gpt_weights?weights_path={gpt_weights}"
+ logger.info(f"{self.log_prefix} 切换GPT模型: {gpt_weights}")
+
+ try:
+ async with session_manager.get(
+ gpt_url,
+ backend_name="gpt_sovits",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ GPTSoVITSBackend._current_gpt_weights = gpt_weights
+ logger.info(f"{self.log_prefix} GPT模型切换成功")
+ elif response.status == 404:
+ # 旧版服务没有 /set_gpt_weights
+ return await _set_model_v1()
+ else:
+ error_text = await response.text()
+ return False, f"GPT模型切换失败: {error_text}"
+ except Exception as e:
+ return False, f"GPT模型切换异常: {e}"
+
+ # 切换 SoVITS 权重
+ if sovits_weights and sovits_weights != GPTSoVITSBackend._current_sovits_weights:
+ sovits_url = f"{server.rstrip('/')}/set_sovits_weights?weights_path={sovits_weights}"
+ logger.info(f"{self.log_prefix} 切换SoVITS模型: {sovits_weights}")
+
+ try:
+ async with session_manager.get(
+ sovits_url,
+ backend_name="gpt_sovits",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ GPTSoVITSBackend._current_sovits_weights = sovits_weights
+ logger.info(f"{self.log_prefix} SoVITS模型切换成功")
+ elif response.status == 404:
+ # 旧版服务没有 /set_sovits_weights
+ return await _set_model_v1()
+ else:
+ error_text = await response.text()
+ return False, f"SoVITS模型切换失败: {error_text}"
+ except Exception as e:
+ return False, f"SoVITS模型切换异常: {e}"
+
+ return True, ""
+
+ def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
+ """
+ 规范化风格配置格式
+
+ 支持两种格式:
+ 1. 旧格式(字典): {"default": {...}, "happy": {...}}
+ 2. 新格式(数组): [{"name": "default", ...}, {"name": "happy", ...}]
+
+ 统一转换为字典格式供内部使用
+ """
+ # 如果是字典格式(旧格式),直接返回
+ if isinstance(styles_config, dict):
+ return styles_config
+
+ # 如果是数组格式(新格式),转换为字典
+ if isinstance(styles_config, list):
+ result = {}
+ for style in styles_config:
+ if isinstance(style, dict) and "name" in style:
+ style_name = style["name"]
+ # 复制配置,移除 name 字段
+ style_data = {k: v for k, v in style.items() if k != "name"}
+ result[style_name] = style_data
+ return result
+
+ # 其他情况返回空字典
+ return {}
+
+ def validate_config(self) -> Tuple[bool, str]:
+ """验证配置"""
+ styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {})
+ styles = self._normalize_styles_config(styles_raw)
+
+ if not styles or "default" not in styles:
+ return False, "GPT-SoVITS未配置任何语音风格"
+
+ default_style = styles.get("default", {})
+ if not default_style.get("refer_wav") or not default_style.get("prompt_text"):
+ return False, "GPT-SoVITS默认风格配置不完整(需要refer_wav和prompt_text)"
+
+ return True, ""
+
+ async def execute(
+ self,
+ text: str,
+ voice: Optional[str] = None,
+ **kwargs
+ ) -> TTSResult:
+ """
+ 执行GPT-SoVITS语音合成
+
+ Args:
+ text: 待转换的文本
+ voice: 风格名称
+
+ Returns:
+ TTSResult
+ """
+ # 验证文本
+ if not text or not text.strip():
+ return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)
+
+ # 获取配置
+ server = self.get_config(ConfigKeys.GPT_SOVITS_SERVER, "http://127.0.0.1:9880")
+ styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {})
+ styles = self._normalize_styles_config(styles_raw)
+ timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)
+
+ # 确定使用的风格
+ voice_style = voice if voice and voice in styles else "default"
+
+ if voice_style not in styles:
+ return TTSResult(
+ False,
+ f"GPT-SoVITS风格 '{voice_style}' 未配置",
+ backend_name=self.backend_name
+ )
+
+ style_config = styles[voice_style]
+ refer_wav_path = style_config.get("refer_wav", "")
+ prompt_text = style_config.get("prompt_text", "")
+ prompt_language = style_config.get("prompt_language", "zh")
+ gpt_weights = style_config.get("gpt_weights")
+ sovits_weights = style_config.get("sovits_weights")
+
+ if not refer_wav_path or not prompt_text:
+ return TTSResult(
+ False,
+ f"GPT-SoVITS风格 '{voice_style}' 配置不完整",
+ backend_name=self.backend_name
+ )
+
+ # 如果配置了模型权重,先切换模型
+ if gpt_weights or sovits_weights:
+ switch_success, switch_error = await self._switch_model(
+ server, gpt_weights, sovits_weights, timeout
+ )
+ if not switch_success:
+ return TTSResult(False, switch_error, backend_name=self.backend_name)
+
+ # 检测文本语言
+ text_language = TTSTextUtils.detect_language(text)
+
+ # 构建请求数据
+ data = {
+ "text": text,
+ "text_lang": text_language,
+ "ref_audio_path": refer_wav_path,
+ "prompt_text": prompt_text,
+ "prompt_lang": prompt_language
+ }
+
+ tts_url = f"{server.rstrip('/')}/tts"
+ legacy_tts_url = f"{server.rstrip('/')}/"
+ legacy_data = {
+ "text": text,
+ "text_language": text_language,
+ "refer_wav_path": refer_wav_path,
+ "prompt_text": prompt_text,
+ "prompt_language": prompt_language,
+ }
+
+ logger.info(f"{self.log_prefix} GPT-SoVITS请求: text='{text[:50]}...', style={voice_style}")
+
+ try:
+ session_manager = await TTSSessionManager.get_instance()
+ async with session_manager.post(
+ tts_url,
+ json=data,
+ backend_name="gpt_sovits",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ audio_data = await response.read()
+
+ # 验证音频数据
+ is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
+ if not is_valid:
+ return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name)
+
+ # 使用统一的发送方法
+ return await self.send_audio(
+ audio_data=audio_data,
+ audio_format="wav",
+ prefix="tts_gpt_sovits",
+ voice_info=f"风格: {voice_style}"
+ )
+ elif response.status == 404:
+ # 兼容旧版 api.py:没有 /tts 端点,回退到根路径
+ logger.warning(f"{self.log_prefix} /tts 端点不存在,尝试兼容模式请求根路径")
+ else:
+ error_info = await response.text()
+ logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}")
+ return TTSResult(
+ False,
+ f"GPT-SoVITS API调用失败: {response.status}",
+ backend_name=self.backend_name
+ )
+
+ # 仅在 /tts 404 时回退到旧版根路径
+ async with session_manager.post(
+ legacy_tts_url,
+ json=legacy_data,
+ backend_name="gpt_sovits",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ audio_data = await response.read()
+
+ # 验证音频数据
+ is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
+ if not is_valid:
+ return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name)
+
+ return await self.send_audio(
+ audio_data=audio_data,
+ audio_format="wav",
+ prefix="tts_gpt_sovits",
+ voice_info=f"风格: {voice_style}"
+ )
+ else:
+ error_info = await response.text()
+ logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}")
+ return TTSResult(
+ False,
+ f"GPT-SoVITS API调用失败: {response.status}",
+ backend_name=self.backend_name
+ )
+
+ except asyncio.TimeoutError:
+ return TTSResult(False, "GPT-SoVITS API调用超时", backend_name=self.backend_name)
+ except Exception as e:
+ logger.error(f"{self.log_prefix} GPT-SoVITS执行错误: {e}")
+ return TTSResult(False, f"GPT-SoVITS执行错误: {e}", backend_name=self.backend_name)
diff --git a/plugins/tts_voice_plugin/backends/gsv2p.py b/plugins/tts_voice_plugin/backends/gsv2p.py
new file mode 100644
index 00000000..8837d881
--- /dev/null
+++ b/plugins/tts_voice_plugin/backends/gsv2p.py
@@ -0,0 +1,186 @@
+"""
+GSV2P 后端实现
+使用 GSV2P 云端 API 进行语音合成
+"""
+
+import asyncio
+import json
+from typing import Optional, Dict, Any, Tuple
+from .base import TTSBackendBase, TTSResult
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..config_keys import ConfigKeys
+from src.common.logger import get_logger
+
+logger = get_logger("tts_gsv2p")
+
+# 重试配置
+MAX_RETRIES = 5 # 最大重试次数
+RETRY_DELAY = 3.0 # 重试间隔(秒)
+
+
+class GSV2PBackend(TTSBackendBase):
+ """
+ GSV2P 后端
+
+ 使用 GSV2P 云端 API 进行高质量语音合成
+ """
+
+ backend_name = "gsv2p"
+ backend_description = "GSV2P云端API语音合成"
+ support_private_chat = True
+ default_audio_format = "mp3"
+
+ def get_default_voice(self) -> str:
+ """获取默认音色"""
+ return self.get_config(ConfigKeys.GSV2P_DEFAULT_VOICE, "原神-中文-派蒙_ZH")
+
+ def validate_config(self) -> Tuple[bool, str]:
+ """验证配置"""
+ api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "")
+ if not api_token:
+ return False, "GSV2P后端缺少API Token配置"
+ return True, ""
+
+ async def _make_request(
+ self,
+ api_url: str,
+ request_data: Dict[str, Any],
+ headers: Dict[str, str],
+ timeout: int
+ ) -> Tuple[bool, Any, str]:
+ """
+ 发送单次API请求
+
+ Returns:
+ (成功标志, 音频数据或None, 错误信息)
+ """
+ session_manager = await TTSSessionManager.get_instance()
+ async with session_manager.post(
+ api_url,
+ json=request_data,
+ headers=headers,
+ backend_name="gsv2p",
+ timeout=timeout
+ ) as response:
+ if response.status == 200:
+ content_type = response.headers.get('Content-Type', '')
+ audio_data = await response.read()
+
+ # 检查是否返回了JSON错误(服务端不稳定时会返回参数错误)
+ if 'application/json' in content_type:
+ try:
+ error_json = json.loads(audio_data.decode('utf-8'))
+ error_msg = error_json.get('error', {}).get('message', str(error_json))
+ # 参数错误通常是服务端临时问题,可以重试
+ return False, None, f"API返回错误: {error_msg}"
+ except Exception:
+ return False, None, "API返回异常响应"
+
+ # 验证音频数据
+ is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
+ if not is_valid:
+ return False, None, f"音频数据无效: {error_msg}"
+
+ return True, audio_data, ""
+ else:
+ error_text = await response.text()
+ return False, None, f"API调用失败: {response.status} - {error_text[:100]}"
+
+ async def execute(
+ self,
+ text: str,
+ voice: Optional[str] = None,
+ **kwargs
+ ) -> TTSResult:
+ """
+ 执行GSV2P语音合成(带重试机制)
+
+ Args:
+ text: 待转换的文本
+ voice: 音色名称
+
+ Returns:
+ TTSResult
+ """
+ # 验证配置
+ is_valid, error_msg = self.validate_config()
+ if not is_valid:
+ return TTSResult(False, error_msg, backend_name=self.backend_name)
+
+ # 验证文本
+ if not text or not text.strip():
+ return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)
+
+ # 获取配置
+ api_url = self.get_config(ConfigKeys.GSV2P_API_URL, "https://gsv2p.acgnai.top/v1/audio/speech")
+ api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "")
+ timeout = self.get_config(ConfigKeys.GSV2P_TIMEOUT, 30)
+
+ if not voice:
+ voice = self.get_default_voice()
+
+ # 构建请求参数(注意:other_params 已被 API 废弃,不再支持)
+ request_data: Dict[str, Any] = {
+ "model": self.get_config(ConfigKeys.GSV2P_MODEL, "tts-v4"),
+ "input": text,
+ "voice": voice,
+ "response_format": self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3"),
+ "speed": self.get_config(ConfigKeys.GSV2P_SPEED, 1)
+ }
+
+ headers = {
+ "accept": "application/json",
+ "Authorization": f"Bearer {api_token}",
+ "Content-Type": "application/json"
+ }
+
+ logger.info(f"{self.log_prefix} GSV2P请求: text='{text[:50]}...', voice={voice}")
+ logger.debug(f"{self.log_prefix} GSV2P完整请求参数: {json.dumps(request_data, ensure_ascii=False, indent=2)}")
+
+ last_error = ""
+ for attempt in range(1, MAX_RETRIES + 1):
+ try:
+ success, audio_data, error_msg = await self._make_request(
+ api_url, request_data, headers, timeout
+ )
+
+ if success and audio_data:
+ if attempt > 1:
+ logger.info(f"{self.log_prefix} GSV2P第{attempt}次重试成功")
+
+ logger.info(f"{self.log_prefix} GSV2P响应: 数据大小={len(audio_data)}字节")
+
+ # 使用统一的发送方法
+ audio_format = self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3")
+ return await self.send_audio(
+ audio_data=audio_data,
+ audio_format=audio_format,
+ prefix="tts_gsv2p",
+ voice_info=f"音色: {voice}"
+ )
+ else:
+ last_error = error_msg
+ if attempt < MAX_RETRIES:
+ logger.warning(f"{self.log_prefix} GSV2P请求失败 ({error_msg}), {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})")
+ await asyncio.sleep(RETRY_DELAY)
+ else:
+ logger.error(f"{self.log_prefix} GSV2P请求失败,已达最大重试次数: {error_msg}")
+
+ except asyncio.TimeoutError:
+ last_error = "API调用超时"
+ if attempt < MAX_RETRIES:
+ logger.warning(f"{self.log_prefix} GSV2P超时, {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})")
+ await asyncio.sleep(RETRY_DELAY)
+ else:
+ logger.error(f"{self.log_prefix} GSV2P超时,已达最大重试次数")
+
+ except Exception as e:
+ last_error = str(e)
+ logger.error(f"{self.log_prefix} GSV2P执行错误: {e}")
+ if attempt < MAX_RETRIES:
+ await asyncio.sleep(RETRY_DELAY)
+ else:
+ break
+
+ return TTSResult(False, f"GSV2P {last_error} (已重试{MAX_RETRIES}次)", backend_name=self.backend_name)
diff --git a/plugins/tts_voice_plugin/config.toml b/plugins/tts_voice_plugin/config.toml
new file mode 100644
index 00000000..9c045560
--- /dev/null
+++ b/plugins/tts_voice_plugin/config.toml
@@ -0,0 +1,292 @@
+# tts_voice_plugin - 自动生成的配置文件
+# 统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice、ComfyUI等多种后端引擎,提供灵活的语音合成能力。
+
+# 插件基本配置
+[plugin]
+
+# 是否启用插件
+enabled = true
+
+# 配置文件版本
+config_version = "3.2.3"
+
+# 通用设置
+
+[general]
+
+# 默认TTS后端(完整可选值见下一行)
+# 可选: ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice
+default_backend = "comfyui_customvoice"
+
+# 请求超时时间(秒)
+timeout = 60
+
+# 最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)
+max_text_length = 200
+
+# 是否使用replyer润色语音内容
+use_replyer_rewrite = true
+
+# 音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)
+audio_output_dir = ""
+
+# 是否使用base64编码发送音频(备选方案)
+use_base64_audio = true
+
+# 是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)
+split_sentences = true
+
+# 分段发送时每条语音之间的延迟(秒)
+split_delay = 0.3
+
+# 自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)
+split_min_total_chars = 120
+
+# 句子最小长度:过短片段会合并到前一句(用于减少碎片段)
+split_min_sentence_chars = 6
+
+# 自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。
+split_max_segments = 3
+
+# 自动分段打包目标长度(字符)。用于把多句合并成更少段。
+split_chunk_chars = 110
+
+# 是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)
+send_error_messages = true
+
+# 组件启用控制
+
+[components]
+
+# 是否启用Action组件
+action_enabled = true
+
+# 是否启用Command组件
+command_enabled = true
+
+# 是否启用 instruct 调试命令组件(/tts_instruct)
+instruct_command_enabled = true
+
+# 概率控制配置
+
+[probability]
+
+# 是否启用概率控制
+enabled = true
+
+# 基础触发概率
+base_probability = 1
+
+# 关键词强制触发
+keyword_force_trigger = true
+
+# 强制触发关键词
+force_keywords = [
+ "一定要用语音",
+ "必须语音",
+ "语音回复我",
+ "务必用语音",
+]
+
+# AI Voice后端配置
+
+[ai_voice]
+
+# 默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)
+default_character = "邻家小妹"
+
+# GSV2P后端配置
+
+[gsv2p]
+
+# GSV2P API地址
+api_url = "https://gsv2p.acgnai.top/v1/audio/speech"
+
+# API认证Token
+api_token = ""
+
+# 默认音色
+default_voice = "原神-中文-派蒙_ZH"
+
+# API请求超时(秒)
+timeout = 149
+
+# TTS模型
+model = "tts-v4"
+
+# 音频格式
+response_format = "wav"
+
+# 语音速度
+speed = 1
+
+# GPT-SoVITS后端配置
+
+[gpt_sovits]
+
+# GPT-SoVITS服务地址
+server = "http://127.0.0.1:9880"
+
+# 语音风格配置
+
+# (每个 [[gpt_sovits.styles]] 条目定义一个可选语音风格;豆包后端配置见下方 [doubao] 段)
+
+[[gpt_sovits.styles]]
+name = "default"
+refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/s978ztt245c3jxms6apadwgna4e7hmb.mp3"
+prompt_text = "私にしてはがんばった方ではないでしょーか?"
+prompt_language = "ja"
+gpt_weights = "/Users/xenon/Downloads/GPT-SoVITS/GPT_weights_v4/seiun-e15.ckpt"
+sovits_weights = "/Users/xenon/Downloads/GPT-SoVITS/SoVITS_weights_v4/seiun_e2_s144_l32.pth"
+
+[[gpt_sovits.styles]]
+name = ""
+refer_wav = ""
+prompt_text = ""
+prompt_language = "zh"
+gpt_weights = ""
+sovits_weights = ""
+
+[doubao]
+
+# 豆包语音API地址
+api_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
+
+# 豆包APP ID
+app_id = ""
+
+# 豆包Access Key
+access_key = ""
+
+# 豆包Resource ID
+resource_id = "seed-tts-2.0"
+
+# 默认音色
+default_voice = "zh_female_vv_uranus_bigtts"
+
+# API请求超时(秒)
+timeout = 60
+
+# 音频格式
+audio_format = "wav"
+
+# 采样率
+sample_rate = 24000
+
+# 比特率
+bitrate = 128000
+
+# CosyVoice后端配置
+
+[cosyvoice]
+
+# Gradio API地址
+gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
+
+# 推理模式(3s极速复刻/自然语言控制)
+default_mode = "3s极速复刻"
+
+# 默认指令(用于自然语言控制模式)
+default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
+
+# 参考音频路径(用于3s极速复刻模式)
+reference_audio = ""
+
+# 提示文本(用于3s极速复刻模式)
+prompt_text = ""
+
+# API请求超时(秒)
+timeout = 300
+
+# 音频格式
+audio_format = "wav"
+
+[comfyui]
+server = "http://127.0.0.1:8188"
+# 必须是 ComfyUI 的 input 目录, backend 会把 refer_wav 复制进去, 再用 LoadAudio 读取
+input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
+timeout = 120
+audio_quality = "128k" # SaveAudioMP3: V0/128k/320k
+mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
+mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
+default_style = "default"
+# Split comfyui backend into two convenient aliases:
+# - comfyui_voiceclone: only uses styles whose mode is voice_clone (or absent)
+# - comfyui_customvoice: only uses styles whose mode is custom_voice
+# These keys let you pick different defaults without duplicating comfyui.styles.
+voiceclone_default_style = "default"
+customvoice_default_style = "seiun"
+auto_instruct_enabled = true
+auto_instruct_max_chars = 320
+
+# 自动推断 instruct 时固定附加的“基调”(persona)。会作为 `基调=...;` 前缀插入。
+# 注意:值里不要包含 ';' 或 '='(backend 会做清洗,但建议从源头避免)。
+auto_instruct_base_tone = "女性约15-16岁,清澈透亮但慵懒的轻女高音,句尾元音随意拉长且略带鼻腔撒娇,咬字松弛像刚睡醒,可在慵懒与冷静锐利间切换,带戏谑亲和"
+
+# 可选:完整基调原文(保留备份,当前不启用)
+# auto_instruct_base_tone = """
+# 女性,外表约15-16岁,音色是清澈透亮却带有慵懒感的轻女高音(Light Soprano)。
+#
+# 嗓音轻盈飘逸,带有明显的“云朵般”的漂浮感,起初是漫不经心的拖沓语调,其特征在于句尾元音的随意拉长(Drawl)以及略带鼻腔共鸣的撒娇感。咬字呈现出一种仿佛刚睡醒般的松弛,甚至伴有刻意为之的含糊,像是一只在阳光下伸懒腰的猫。
+#
+# 随后,这种慵懒被一种狡黠的机敏所取代,声音在毫无干劲的叹息与看穿一切的通透感之间自如切换。在表现谋略或胜负欲的瞬间,音色会瞬间收紧,去除了所有的气声装饰与慵懒拖音,转为冷静、干练且直击要害的中高频。
+#
+# 表现风格既显得捉摸不透又带有戏谑的亲和力,伴随着轻巧的换气声和偶尔出现的、带有试探意味的升调尾音。仿佛在脱力系(Listless)的无害表象之下,潜藏着绝顶聪明的头脑与绝不让步的自尊。
+# """
+
+auto_instruct_prompt = """
+你是精通声学特征与戏剧表演的 AI 配音导演。你的任务是根据「待朗读文本」生成一行 TTS instruct(用于 Qwen3-TTS CustomVoice 的语音表演控制)。
+
+硬性要求:
+- 只输出一行(单行 KV),不要解释,不要引号/代码块,不要复述原文。
+- 必须同时包含以下字段,并用英文分号 ';' 分隔:情绪、强度、语速、停顿、表现
+- 输出格式固定为:情绪=<...>;强度=<...>;语速=<...>;停顿=<...>;表现=<...>
+- 语速可选:很慢/稍慢/正常/稍快/很快
+- 停顿可选:很少/自然/稍多/很多
+- 强度可选:很低/低/中/高/很高
+- 表现:用 3-6 个短提示词,使用逗号分隔(不要用分号),如:声压高,咬字重,重音强,尾音下压
+- 长度 <= {max_chars} 字
+
+强制增强规则(避免“生气但听起来不够生气”):
+- 如果文本出现:非常/极其/真的/气死/怒/吼/滚/闭嘴/你再说一次 等强烈信号,情绪优先用「愤怒」,强度至少「高」,表现要包含“声压高/咬字重/重音强/尾音下压”中的至少 2 项。
+- 如果是嘲讽或冷笑式的怒气:情绪写「愤怒(冷)」或「愤怒+嘲讽」,表现包含“冷硬/压低/咬字利落/少气声”。
+
+文本语言: {lang}
+待朗读文本: {text}
+"""
+
+# 基础停顿(秒)。当 instruct 包含“停顿=...”时,会按 很少/自然/稍多/很多 做倍率缩放。
+pause_linebreak = 0.18
+period_pause = 0.22
+comma_pause = 0.1
+question_pause = 0.2
+hyphen_pause = 0.06
+
+[[comfyui.styles]]
+name = "default"
+refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/default_ref_24k_mono.wav"
+prompt_text = "私にしてはがんばった方ではないでしょーか?"
+language = "Auto"
+model_choice = "1.7B"
+precision = "bf16"
+seed = 0
+max_new_tokens = 2048
+top_p = 0.8
+top_k = 20
+temperature = 1
+repetition_penalty = 1.05
+
+[[comfyui.styles]]
+name = "seiun"
+mode = "custom_voice"
+model_path = "/Users/xenon/Downloads/checkpoint-epoch-9"
+speaker = "seiun"
+instruct = "__AUTO__"
+speed = 1
+language = "Auto"
+seed = 0
+max_new_tokens = 2048
+top_p = 0.9
+top_k = 20
+temperature = 0.9
+repetition_penalty = 1.05
diff --git a/plugins/tts_voice_plugin/config_keys.py b/plugins/tts_voice_plugin/config_keys.py
new file mode 100644
index 00000000..b7993eca
--- /dev/null
+++ b/plugins/tts_voice_plugin/config_keys.py
@@ -0,0 +1,103 @@
"""
Configuration key constants.

Collects every dotted config key used by the plugin in one place so the
rest of the code never hard-codes key strings.
"""


class ConfigKeys:
    """Dotted configuration-key constants, grouped by config-file section."""

    # ========== [plugin] section ==========
    PLUGIN_ENABLED = "plugin.enabled"
    PLUGIN_CONFIG_VERSION = "plugin.config_version"

    # ========== [general] section ==========
    GENERAL_DEFAULT_BACKEND = "general.default_backend"
    GENERAL_TIMEOUT = "general.timeout"
    GENERAL_MAX_TEXT_LENGTH = "general.max_text_length"
    GENERAL_USE_REPLYER_REWRITE = "general.use_replyer_rewrite"
    GENERAL_AUDIO_OUTPUT_DIR = "general.audio_output_dir"
    GENERAL_USE_BASE64_AUDIO = "general.use_base64_audio"
    GENERAL_SPLIT_SENTENCES = "general.split_sentences"
    GENERAL_SPLIT_DELAY = "general.split_delay"
    GENERAL_SPLIT_MIN_TOTAL_CHARS = "general.split_min_total_chars"
    GENERAL_SPLIT_MIN_SENTENCE_CHARS = "general.split_min_sentence_chars"
    GENERAL_SPLIT_MAX_SEGMENTS = "general.split_max_segments"
    GENERAL_SPLIT_CHUNK_CHARS = "general.split_chunk_chars"
    GENERAL_SEND_ERROR_MESSAGES = "general.send_error_messages"

    # ========== [components] section ==========
    COMPONENTS_ACTION_ENABLED = "components.action_enabled"
    COMPONENTS_COMMAND_ENABLED = "components.command_enabled"
    COMPONENTS_INSTRUCT_COMMAND_ENABLED = "components.instruct_command_enabled"

    # ========== [probability] section ==========
    PROBABILITY_ENABLED = "probability.enabled"
    PROBABILITY_BASE_PROBABILITY = "probability.base_probability"
    PROBABILITY_KEYWORD_FORCE_TRIGGER = "probability.keyword_force_trigger"
    PROBABILITY_FORCE_KEYWORDS = "probability.force_keywords"

    # ========== [ai_voice] section ==========
    AI_VOICE_DEFAULT_CHARACTER = "ai_voice.default_character"
    AI_VOICE_ALIAS_MAP = "ai_voice.alias_map"

    # ========== [gsv2p] section ==========
    GSV2P_API_URL = "gsv2p.api_url"
    GSV2P_API_TOKEN = "gsv2p.api_token"
    GSV2P_DEFAULT_VOICE = "gsv2p.default_voice"
    GSV2P_TIMEOUT = "gsv2p.timeout"
    GSV2P_MODEL = "gsv2p.model"
    GSV2P_RESPONSE_FORMAT = "gsv2p.response_format"
    GSV2P_SPEED = "gsv2p.speed"

    # ========== [gpt_sovits] section ==========
    GPT_SOVITS_SERVER = "gpt_sovits.server"
    GPT_SOVITS_STYLES = "gpt_sovits.styles"

    # ========== [doubao] section ==========
    DOUBAO_API_URL = "doubao.api_url"
    DOUBAO_APP_ID = "doubao.app_id"
    DOUBAO_ACCESS_KEY = "doubao.access_key"
    DOUBAO_RESOURCE_ID = "doubao.resource_id"
    DOUBAO_DEFAULT_VOICE = "doubao.default_voice"
    DOUBAO_TIMEOUT = "doubao.timeout"
    DOUBAO_AUDIO_FORMAT = "doubao.audio_format"
    DOUBAO_SAMPLE_RATE = "doubao.sample_rate"
    DOUBAO_BITRATE = "doubao.bitrate"
    DOUBAO_SPEED = "doubao.speed"
    DOUBAO_VOLUME = "doubao.volume"
    DOUBAO_CONTEXT_TEXTS = "doubao.context_texts"

    # ========== [cosyvoice] section ==========
    COSYVOICE_GRADIO_URL = "cosyvoice.gradio_url"
    COSYVOICE_DEFAULT_MODE = "cosyvoice.default_mode"
    COSYVOICE_DEFAULT_INSTRUCT = "cosyvoice.default_instruct"
    COSYVOICE_REFERENCE_AUDIO = "cosyvoice.reference_audio"
    COSYVOICE_PROMPT_TEXT = "cosyvoice.prompt_text"
    COSYVOICE_TIMEOUT = "cosyvoice.timeout"
    COSYVOICE_AUDIO_FORMAT = "cosyvoice.audio_format"

    # ========== [comfyui] (Workflow API) section ==========
    COMFYUI_SERVER = "comfyui.server"
    COMFYUI_INPUT_DIR = "comfyui.input_dir"
    COMFYUI_TIMEOUT = "comfyui.timeout"
    COMFYUI_DEFAULT_STYLE = "comfyui.default_style"
    COMFYUI_STYLES = "comfyui.styles"
    # Convenience aliases to split voiceclone/customvoice at the plugin level.
    # Both backends still use comfyui.styles, but these keys let you pick different defaults.
    COMFYUI_VOICECLONE_DEFAULT_STYLE = "comfyui.voiceclone_default_style"
    COMFYUI_CUSTOMVOICE_DEFAULT_STYLE = "comfyui.customvoice_default_style"
    COMFYUI_AUDIO_QUALITY = "comfyui.audio_quality"
    COMFYUI_MLX_PYTHON = "comfyui.mlx_python"
    COMFYUI_MLX_CLI = "comfyui.mlx_cli"
    COMFYUI_PAUSE_LINEBREAK = "comfyui.pause_linebreak"
    COMFYUI_PERIOD_PAUSE = "comfyui.period_pause"
    COMFYUI_COMMA_PAUSE = "comfyui.comma_pause"
    COMFYUI_QUESTION_PAUSE = "comfyui.question_pause"
    COMFYUI_HYPHEN_PAUSE = "comfyui.hyphen_pause"

    # Auto instruct (CustomVoice)
    COMFYUI_AUTO_INSTRUCT_ENABLED = "comfyui.auto_instruct_enabled"
    COMFYUI_AUTO_INSTRUCT_BASE_TONE = "comfyui.auto_instruct_base_tone"
    COMFYUI_AUTO_INSTRUCT_PROMPT = "comfyui.auto_instruct_prompt"
    COMFYUI_AUTO_INSTRUCT_MAX_CHARS = "comfyui.auto_instruct_max_chars"
diff --git a/plugins/tts_voice_plugin/plugin.py b/plugins/tts_voice_plugin/plugin.py
new file mode 100644
index 00000000..8ee2b155
--- /dev/null
+++ b/plugins/tts_voice_plugin/plugin.py
@@ -0,0 +1,972 @@
"""
Unified TTS voice-synthesis plugin.

Supported backends: AI Voice (built into MaiCore), GSV2P (cloud API),
GPT-SoVITS (local service), Doubao voice (cloud API), CosyVoice
(ModelScope Gradio), plus ComfyUI workflow-API backends.

Version: 3.2.3
Author: 靓仔
"""

import sys

# Keep the plugin directory free of __pycache__ artifacts.
sys.dont_write_bytecode = True

import asyncio
import random
from typing import List, Tuple, Type, Optional

from src.common.logger import get_logger
from src.plugin_system.base.base_plugin import BasePlugin
from src.plugin_system.apis.plugin_register_api import register_plugin
from src.plugin_system.base.base_action import BaseAction, ActionActivationType
from src.plugin_system.base.base_command import BaseCommand
from src.plugin_system.base.component_types import ComponentInfo, ChatMode
from src.plugin_system.base.config_types import ConfigField
from src.plugin_system.apis import generator_api

# Modular backends and helpers shipped with this plugin.
from .backends import TTSBackendRegistry, TTSResult
from .backends.ai_voice import AI_VOICE_ALIAS_MAP
from .backends.doubao import DOUBAO_EMOTION_MAP
from .utils.text import TTSTextUtils
from .config_keys import ConfigKeys

logger = get_logger("tts_voice_plugin")

# Backend identifiers accepted by `general.default_backend` and by the
# `backend` parameter of the action/command components.
VALID_BACKENDS = [
    "ai_voice",
    "gsv2p",
    "gpt_sovits",
    "doubao",
    "cosyvoice",
    "comfyui",
    "comfyui_voiceclone",
    "comfyui_customvoice",
]
+
+
class TTSExecutorMixin:
    """Shared TTS execution logic used by both the Action and the Command.

    Host classes are expected to provide ``get_config``, ``log_prefix``,
    ``send_text``, ``send_custom``/``send_command``, and either a
    ``chat_stream`` attribute (Action) or a ``message`` attribute (Command).
    """

    def _create_backend(self, backend_name: str):
        """Instantiate the backend registered under *backend_name*.

        Returns the backend instance, or a falsy value when the name is
        unknown to the registry.
        """
        instance = TTSBackendRegistry.create(
            backend_name,
            self.get_config,
            self.log_prefix
        )
        if not instance:
            return instance

        # Wire the messaging callbacks into backends that accept them.
        if hasattr(instance, 'set_send_custom'):
            instance.set_send_custom(self.send_custom)
        if hasattr(instance, 'set_send_command'):
            instance.set_send_command(self.send_command)
        return instance

    async def _execute_backend(
        self,
        backend_name: str,
        text: str,
        voice: str = "",
        emotion: str = ""
    ) -> TTSResult:
        """Run one synthesis request on *backend_name*.

        Args:
            backend_name: registry name of the backend to use.
            text: text to synthesise.
            voice: voice/style parameter (backend-specific).
            emotion: emotion parameter (Doubao backend only).

        Returns:
            The backend's TTSResult, or a failure result for unknown backends.
        """
        engine = self._create_backend(backend_name)
        if not engine:
            return TTSResult(
                success=False,
                message=f"未知的TTS后端: {backend_name}"
            )

        # AI Voice only works in group chats; reroute private chats to GSV2P.
        if backend_name == "ai_voice" and self._check_is_private_chat():
            logger.info(f"{self.log_prefix} AI语音仅支持群聊,自动切换到GSV2P后端")
            return await self._execute_backend("gsv2p", text, voice, emotion)

        # Pass chat context through for backends that need MaiBot LLM APIs
        # (e.g. comfyui auto_instruct).
        context = None
        if hasattr(self, "chat_stream"):
            context = getattr(self, "chat_stream", None)
        elif hasattr(self, "message"):
            context = getattr(getattr(self, "message", None), "chat_stream", None)

        return await engine.execute(text, voice, emotion=emotion, chat_stream=context)

    def _check_is_private_chat(self) -> bool:
        """Return True when the current conversation has no group context."""
        # Actions expose `chat_stream`; Commands expose `message`.
        if hasattr(self, 'chat_stream'):
            return not getattr(self.chat_stream, 'group_info', None)
        if hasattr(self, 'message'):
            info = getattr(self.message, 'message_info', None)
            if info:
                return not getattr(info, 'group_info', None)
        return False

    def _get_default_backend(self) -> str:
        """Return the configured default backend, falling back to gsv2p."""
        choice = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p")
        if choice in VALID_BACKENDS:
            return choice
        logger.warning(f"{self.log_prefix} 配置的默认后端 '{choice}' 无效,使用 gsv2p")
        return "gsv2p"

    async def _send_error(self, message: str) -> None:
        """Send an error notice to the chat unless error reporting is disabled.

        Args:
            message: error text to deliver.
        """
        if not self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True):
            return
        await self.send_text(message)
+
+
class UnifiedTTSAction(BaseAction, TTSExecutorMixin):
    """Unified TTS action — triggered automatically by the LLM planner."""

    action_name = "unified_tts_action"
    action_description = "用语音回复(支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)"
    activation_type = ActionActivationType.KEYWORD
    mode_enable = ChatMode.ALL
    parallel_action = False

    # Keywords that make the planner consider this action.
    activation_keywords = [
        "语音", "说话", "朗读", "念一下", "读出来",
        "voice", "speak", "tts", "语音回复", "用语音说", "播报"
    ]
    keyword_case_sensitive = False

    # Parameter descriptions shown to the planner LLM.
    action_parameters = {
        "text": "要转换为语音的文本内容(必填)",
        "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice,可选,建议省略让系统自动使用配置的默认后端)",
        "voice": "音色/风格参数(可选)",
        "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等"
    }

    # Usage guidance shown to the planner LLM.
    action_require = [
        "当用户要求用语音回复时使用",
        "当回复简短问候语时使用(如早上好、晚安、你好等)",
        "当想让回复更活泼生动时可以使用",
        "注意:回复内容过长或者过短不适合用语音",
        "注意:backend参数建议省略,系统会自动使用配置的默认后端"
    ]

    associated_types = ["text", "command"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)
        # NOTE(review): fallback default is 500 here but 200 in
        # _get_final_text for the same config key — confirm which is intended.
        self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)

    def _check_force_trigger(self, text: str) -> bool:
        """Return True when *text* contains a keyword that forces voice output."""
        if not self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True):
            return False
        force_keywords = self.get_config(
            ConfigKeys.PROBABILITY_FORCE_KEYWORDS,
            ["一定要用语音", "必须语音", "语音回复我", "务必用语音"]
        )
        return any(kw in text for kw in force_keywords)

    def _probability_check(self, text: str) -> bool:
        """Roll against the configured base probability; True means 'use voice'."""
        if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True):
            return True

        base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0)
        # Clamp to [0, 1] in case of a bad config value.
        base_prob = max(0.0, min(1.0, base_prob))
        result = random.random() < base_prob
        logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}")
        return result

    async def _get_final_text(self, raw_text: str, reason: str, use_replyer: bool) -> Tuple[bool, str]:
        """Produce the final text to synthesise.

        When *use_replyer* is True the text is regenerated through
        generator_api.generate_reply using the same prompt parameters as a
        normal reply; otherwise *raw_text* is returned as-is.

        Returns:
            (success, text) — success is False when no usable text exists.
        """
        max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200)

        if not use_replyer:
            if not raw_text:
                return False, ""
            return True, raw_text

        try:
            # Always go through generate_reply so the POST_LLM event fires
            # (schedule injection); rewrite_reply does not trigger POST_LLM.
            # The length constraint is appended last to exploit the LLM's
            # recency bias and improve compliance.
            extra_info_parts = []
            if raw_text:
                extra_info_parts.append(f"期望的回复内容:{raw_text}")
            # Length constraint goes last, phrased as a hard requirement.
            extra_info_parts.append(
                f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。"
                f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。"
            )

            success, llm_response = await generator_api.generate_reply(
                chat_stream=self.chat_stream,
                reply_message=self.action_message,
                reply_reason=reason,
                extra_info="\n".join(extra_info_parts),
                request_type="tts_voice_plugin",
                from_plugin=False  # allow POST_LLM events so schedule injection works
            )
            if success and llm_response and llm_response.content:
                logger.info(f"{self.log_prefix} 语音内容生成成功")
                return True, llm_response.content.strip()

            # Generation failed — fall back to the raw text when available.
            if raw_text:
                logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本")
                return True, raw_text

            return False, ""
        except Exception as e:
            logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}")
            return bool(raw_text), raw_text

    async def execute(self) -> Tuple[bool, str]:
        """Run the TTS action: resolve text, clean it, then synthesise and send.

        Returns:
            (success, status message).
        """

        def _chunk_sentences(
            parts: List[str], target_chars: int, max_chunks: int
        ) -> List[str]:
            # Greedy packing: reduces tiny fragments into fewer, longer segments.
            if not parts:
                return []
            if target_chars <= 0:
                target_chars = 120

            def pack(tgt: int) -> List[str]:
                out: List[str] = []
                cur = ""
                for s in parts:
                    s = (s or "").strip()
                    if not s:
                        continue
                    if not cur:
                        cur = s
                        continue
                    if len(cur) + len(s) <= tgt:
                        cur += s
                    else:
                        out.append(cur)
                        cur = s
                if cur:
                    out.append(cur)
                return out

            packed = pack(target_chars)
            if max_chunks and max_chunks > 0 and len(packed) > max_chunks:
                # Still too many chunks: grow the target so the result fits.
                total = len("".join(parts))
                new_target = max(target_chars, int(total / max_chunks) + 1)
                packed = pack(new_target)
            return packed

        async def send_message_single_sentences() -> Tuple[bool, str]:
            # Send the whole cleaned text as one voice message.
            # NOTE: closes over `backend`/`clean_text`/`voice`/`emotion`, which
            # are assigned later in execute() before this closure is called.
            result = await self._execute_backend(backend, clean_text, voice, emotion)
            if result.success:
                # Detailed action record helps the planner avoid re-running this action.
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用语音回复:{text_preview}",
                    action_done=True
                )
            else:
                await self._send_error(f"语音合成失败: {result.message}")

            return result.success, result.message

        async def send_message_with_splited_sentences() -> Tuple[bool, str]:
            # Split-send mode: emit one voice message per sentence.
            if len(sentences) > 1:
                logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)} 句")

                success_count = 0
                all_sentences_text = []

                for i, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue

                    logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...")
                    result = await self._execute_backend(backend, sentence, voice, emotion)

                    if result.success:
                        success_count += 1
                        all_sentences_text.append(sentence)
                    else:
                        logger.warning(f"{self.log_prefix} 第 {i + 1} 句发送失败: {result.message}")

                    # Delay between consecutive voice messages.
                    if i < len(sentences) - 1 and split_delay > 0:
                        await asyncio.sleep(split_delay)

                # Record the action outcome.
                if success_count > 0:
                    # Detailed record helps the planner avoid repeating itself.
                    display_text = "".join(all_sentences_text)
                    text_preview = display_text[:80] + "..." if len(display_text) > 80 else display_text
                    await self.store_action_info(
                        action_build_into_prompt=True,
                        action_prompt_display=f"已用语音回复({success_count}段):{text_preview}",
                        action_done=True
                    )
                    return True, f"成功发送 {success_count}/{len(sentences)} 条语音"
                else:
                    await self._send_error("语音合成失败")
                    return False, "所有语音发送失败"
            else:
                # Only one sentence — send normally.
                return await send_message_single_sentences()

        # NOTE(review): this bare string literal is a no-op statement; it was
        # presumably meant to be the docstring of execute().
        """执行TTS语音合成"""
        try:
            raw_text = self.action_data.get("text", "").strip()
            voice = self.action_data.get("voice", "")
            reason = self.action_data.get("reason", "")
            emotion = self.action_data.get("emotion", "")

            use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True)

            # Resolve the final text (optionally via the replyer LLM).
            success, final_text = await self._get_final_text(raw_text, reason, use_replyer)
            if not success or not final_text:
                await self._send_error("无法生成语音内容")
                return False, "文本为空"

            # Probability gate (skipped when a force keyword is present).
            force_trigger = self._check_force_trigger(final_text)
            if not force_trigger and not self._probability_check(final_text):
                logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复")
                await self.send_text(final_text)
                text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}",
                    action_done=True
                )
                return True, "概率检查未通过,已发送文字回复"

            # Clean the text (strip special characters, replace slang).
            # Length should already be respected by the LLM; this is only
            # character-level cleanup.
            clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length)
            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空"

            # If the cleaned text still exceeds the limit, the LLM ignored the
            # length constraint — degrade to a plain-text reply.
            if len(clean_text) > self.max_text_length:
                logger.warning(
                    f"{self.log_prefix} LLM生成的文本超过长度限制 "
                    f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复"
                )
                await self.send_text(clean_text)
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(内容过长):{text_preview}",
                    action_done=True
                )
                return True, "内容超过语音长度限制,已改为文字回复"

            # Pick the backend and run it.
            backend = self._get_default_backend()
            logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}")

            # Split-send configuration.
            split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True)
            split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3)

            sentences = None

            # Prefer explicit split markers from the smart-segmentation plugin.
            if '|||SPLIT|||' in clean_text:
                logger.info("found split marker from smart segmentation plugin")
                sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()]
                # If the upstream splitter is too aggressive, pack back into fewer segments.
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
                if max_segments and max_segments > 0 and len(sentences) > max_segments:
                    sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            elif split_sentences:
                # Automatic segmentation: short texts are not split; long texts
                # are capped at N segments to avoid spamming many voice messages.
                min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120)
                min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6)
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)

                if len(clean_text) < min_total:
                    sentences = [clean_text]
                else:
                    sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence)
                    if max_segments and max_segments > 0:
                        sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            else:
                # Splitting disabled — send as a single message.
                return await send_message_single_sentences()

        except Exception as e:
            error_msg = str(e)
            logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}")
            await self._send_error(f"语音合成出错: {error_msg}")
            return False, error_msg
+
+
class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin):
    """Unified TTS command — triggered manually by the user (``/tts`` etc.)."""

    command_name = "unified_tts_command"
    command_description = "将文本转换为语音,支持多种后端和音色"
    # FIX: the named groups had lost their "<name>" part ("(?P.+?)" is not
    # valid regex and cannot compile). Restored the <text>/<voice>/<backend>
    # names that execute() reads via matched_groups.
    command_pattern = (
        r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)"
        r"\s+(?P<text>.+?)"
        r"(?:\s+-v\s+(?P<voice>\S+))?"
        r"(?:\s+(?P<backend>ai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$"
    )
    command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]"
    command_examples = [
        "/tts 你好,世界!",
        "/tts 今天天气不错 -v 小新",
        "/gptsovits 你好世界 -v default",
        "/cosyvoice 你好世界 -v 四川话",
        "/tts 试试 -v 温柔妹妹 ai_voice",
        "/gsv2p 你好世界",
        "/doubao 你好世界 -v 开心"
    ]
    intercept_message = True

    async def _send_help(self):
        """Send the plugin's usage/help text to the chat."""
        default_backend = self._get_default_backend()

        help_text = """【TTS语音合成插件帮助】

📝 基本语法:
/tts <文本> [-v <音色>] [后端]

 🎯 快捷命令:
 /tts <文本> 使用默认后端
 /voice <文本> 使用 AI Voice
 /gsv2p <文本> 使用 GSV2P
 /gptsovits <文本> 使用 GPT-SoVITS
 /doubao <文本> 使用 豆包语音
 /cosyvoice <文本> 使用 CosyVoice
 /comfyui <文本> 使用 ComfyUI(本地工作流)
 /comfyui_voiceclone <文本> 使用 ComfyUI VoiceClone
 /comfyui_customvoice <文本> 使用 ComfyUI CustomVoice

 🔊 可用后端:
 • ai_voice - MaiCore内置(仅群聊)
 • gsv2p - 云端API,高质量
 • gpt_sovits - 本地服务,可定制
 • doubao - 火山引擎,支持情感
 • cosyvoice - 阿里云,支持方言
 • comfyui - 本地ComfyUI工作流(自动按 style.mode 选择)
 • comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone)
 • comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice)

🎭 音色/情感参数(-v):
• AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种
• GSV2P: 原神-中文-派蒙_ZH 等(见API文档)
• 豆包: 开心、生气、伤心、撒娇、严肃 等
• CosyVoice: 广东话、四川话、东北话、开心、慢速 等

📌 示例:
/tts 你好世界
/tts 今天真开心 -v 开心
/gptsovits 这是本地语音合成
/doubao 我生气了 -v 生气
/cosyvoice 你好 -v 广东话
/voice 测试一下 -v 温柔妹妹

⚙️ 当前默认后端:""" + default_backend

        await self.send_text(help_text)

    def _determine_backend(self, user_backend: str) -> Tuple[str, str]:
        """
        Resolve which backend to use.

        Priority: command prefix > explicit backend argument > configured default.

        Returns:
            (backend_name, source_description)
        """
        # 1. Command prefix.
        raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text
        if raw_text:
            # Map of command prefixes to backend names.
            prefix_backend_map = {
                "/gsv2p": "gsv2p",
                "/gptsovits": "gpt_sovits",
                "/doubao": "doubao",
                "/cosyvoice": "cosyvoice",
                "/voice": "ai_voice",
                "/comfyui": "comfyui",
                "/comfyui_voiceclone": "comfyui_voiceclone",
                "/comfyui_customvoice": "comfyui_customvoice",
            }
            # FIX: test longer prefixes first — in insertion order "/comfyui"
            # would shadow "/comfyui_voiceclone" and "/comfyui_customvoice".
            for prefix in sorted(prefix_backend_map, key=len, reverse=True):
                if raw_text.startswith(prefix):
                    return prefix_backend_map[prefix], f"命令前缀 {prefix}"

        # 2. Explicit backend argument.
        if user_backend and user_backend in VALID_BACKENDS:
            return user_backend, f"命令参数 {user_backend}"

        # 3. Configured default.
        return self._get_default_backend(), "配置文件"

    async def execute(self) -> Tuple[bool, str, bool]:
        """Execute the TTS command.

        Returns:
            (success, status message, intercept flag).
        """
        try:
            text = self.matched_groups.get("text", "").strip()
            voice = self.matched_groups.get("voice", "")
            user_backend = self.matched_groups.get("backend", "")

            # Help request.
            if text.lower() == "help":
                await self._send_help()
                return True, "显示帮助信息", True

            if not text:
                await self._send_error("请输入要转换为语音的文本内容")
                return False, "缺少文本内容", True

            # Resolve the backend.
            backend, backend_source = self._determine_backend(user_backend)

            # Clean the text.
            max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)
            clean_text = TTSTextUtils.clean_text(text, max_length)

            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空", True

            # Enforce the length limit by degrading to a plain-text reply.
            if len(clean_text) > max_length:
                await self.send_text(
                    f"文本过长({len(clean_text)}字符),"
                    f"超过语音合成限制({max_length}字符),"
                    f"已改为文字发送。\n\n{clean_text}"
                )
                return True, "文本过长,已改为文字发送", True

            logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})")

            # For CosyVoice and Doubao the -v argument actually carries the
            # emotion/dialect, not a voice name.
            if backend in ["cosyvoice", "doubao"]:
                result = await self._execute_backend(backend, clean_text, voice="", emotion=voice)
            else:
                result = await self._execute_backend(backend, clean_text, voice)

            if not result.success:
                await self._send_error(f"语音合成失败: {result.message}")

            return result.success, result.message, True

        except Exception as e:
            logger.error(f"{self.log_prefix} TTS命令执行出错: {e}")
            await self._send_error(f"语音合成出错: {e}")
            return False, f"执行出错: {e}", True
+
+
class TTSInstructCommand(BaseCommand):
    """Generate a CustomVoice instruct line (debug/preview helper)."""

    command_name = "tts_instruct_command"
    command_description = "根据待朗读文本生成 CustomVoice 的 instruct(情绪/语速/停顿)"
    # FIX: the pattern previously read r"^/tts_instruct\\s+(?P.+?)$" — inside a
    # raw string "\\s" matches a literal backslash plus "s", and the group had
    # lost its "<text>" name, so the command could never match nor populate
    # matched_groups["text"].
    command_pattern = r"^/tts_instruct\s+(?P<text>.+?)$"
    command_help = "用法:/tts_instruct <文本>"
    command_examples = [
        "/tts_instruct 早上好,今天也要加油。",
        "/tts_instruct えっ?本当にそうなの?",
    ]
    intercept_message = True

    async def execute(self) -> Tuple[bool, str, bool]:
        """Generate and send the instruct line for the given text.

        Returns:
            (success, status message, intercept flag) — the third element is
            now a bool for consistency with UnifiedTTSCommand (it was the
            truthy integer 2 before).
        """
        try:
            text = (self.matched_groups.get("text") or "").strip()
            if not text:
                await self.send_text("请输入要生成 instruct 的文本")
                return False, "缺少文本", True

            # Use the same logic as the ComfyUI backend's auto_instruct.
            # Imported lazily so the ComfyUI backend is only loaded on demand;
            # TTSTextUtils is already imported at module level.
            from .backends.comfyui import ComfyUIBackend

            detected = TTSTextUtils.detect_language(text)
            chat_stream = getattr(self.message, "chat_stream", None)
            chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None

            backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix)
            instruct = await backend._infer_instruct(
                text=text,
                detected_lang=detected,
                chat_stream=chat_stream,
                chat_id=chat_id,
                style_name="__command__",
            )

            if not instruct:
                await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)")
                return False, "instruct 生成失败", True

            await self.send_text(instruct)
            return True, "instruct 已生成", True
        except Exception as e:
            await self.send_text(f"instruct 生成异常: {e}")
            return False, str(e), True
+
+
@register_plugin
class UnifiedTTSPlugin(BasePlugin):
    """Unified TTS plugin: registers the action/command components and
    declares the full configuration schema for all supported backends."""

    plugin_name = "tts_voice_plugin"
    plugin_description = "统一TTS语音合成插件,支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端"
    plugin_version = "3.2.3"
    plugin_author = "靓仔"
    enable_plugin = True
    config_file_name = "config.toml"
    dependencies = []
    python_dependencies = ["aiohttp"]

    # Human-readable descriptions for each config.toml section.
    config_section_descriptions = {
        "plugin": "插件基本配置",
        "general": "通用设置",
        "components": "组件启用控制",
        "probability": "概率控制配置",
        "ai_voice": "AI Voice后端配置",
        "gsv2p": "GSV2P后端配置",
        "gpt_sovits": "GPT-SoVITS后端配置",
        "doubao": "豆包语音后端配置",
        "cosyvoice": "CosyVoice后端配置",
        "comfyui": "ComfyUI工作流API后端配置"
    }

    # Declarative schema: section -> key -> ConfigField. The dotted keys in
    # ConfigKeys must stay in sync with this structure.
    config_schema = {
        "plugin": {
            "enabled": ConfigField(type=bool, default=True, description="是否启用插件"),
            "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本")
        },
        "general": {
            "default_backend": ConfigField(
                type=str, default="cosyvoice",
                description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)"
            ),
            "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"),
            "max_text_length": ConfigField(
                type=int, default=200,
                description="最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)"
            ),
            "use_replyer_rewrite": ConfigField(
                type=bool, default=True,
                description="是否使用replyer润色语音内容"
            ),
            "audio_output_dir": ConfigField(
                type=str, default="",
                description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)"
            ),
            "use_base64_audio": ConfigField(
                type=bool, default=True,
                description="是否使用base64编码发送音频(备选方案)"
            ),
            "split_sentences": ConfigField(
                type=bool, default=True,
                description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)"
            ),
            "split_delay": ConfigField(
                type=float, default=0.3,
                description="分段发送时每条语音之间的延迟(秒)"
            ),
            "split_min_total_chars": ConfigField(
                type=int, default=120,
                description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)",
            ),
            "split_min_sentence_chars": ConfigField(
                type=int, default=6,
                description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)",
            ),
            "split_max_segments": ConfigField(
                type=int, default=3,
                description="自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。",
            ),
            "split_chunk_chars": ConfigField(
                type=int, default=110,
                description="自动分段打包目标长度(字符)。用于把多句合并成更少段。",
            ),
            "send_error_messages": ConfigField(
                type=bool, default=True,
                description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)"
            )
        },
        "components": {
            "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"),
            "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"),
            "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)")
        },
        "probability": {
            "enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"),
            "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"),
            "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"),
            "force_keywords": ConfigField(
                type=list,
                default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"],
                description="强制触发关键词"
            )
        },
        "ai_voice": {
            "default_character": ConfigField(
                type=str,
                default="邻家小妹",
                description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)"
            )
        },
        "gsv2p": {
            "api_url": ConfigField(
                type=str, default="https://gsv2p.acgnai.top/v1/audio/speech",
                description="GSV2P API地址"
            ),
            "api_token": ConfigField(type=str, default="", description="API认证Token"),
            "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"),
            "timeout": ConfigField(type=int, default=120, description="API请求超时(秒)"),
            "model": ConfigField(type=str, default="tts-v4", description="TTS模型"),
            "response_format": ConfigField(type=str, default="wav", description="音频格式"),
            "speed": ConfigField(type=float, default=1.0, description="语音速度")
        },
        "gpt_sovits": {
            "server": ConfigField(
                type=str, default="http://127.0.0.1:9880",
                description="GPT-SoVITS服务地址"
            ),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "prompt_language": "zh",
                        "gpt_weights": "",
                        "sovits_weights": ""
                    }
                ],
                description="语音风格配置",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本", "required": True},
                    "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"},
                    "gpt_weights": {"type": "string", "label": "GPT模型权重路径(可选)", "required": False},
                    "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径(可选)", "required": False}
                }
            )
        },
        "doubao": {
            "api_url": ConfigField(
                type=str,
                default="https://openspeech.bytedance.com/api/v3/tts/unidirectional",
                description="豆包语音API地址"
            ),
            "app_id": ConfigField(type=str, default="", description="豆包APP ID"),
            "access_key": ConfigField(type=str, default="", description="豆包Access Key"),
            "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"),
            "default_voice": ConfigField(
                type=str, default="zh_female_vv_uranus_bigtts",
                description="默认音色"
            ),
            "timeout": ConfigField(type=int, default=60, description="API请求超时(秒)"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式"),
            "sample_rate": ConfigField(type=int, default=24000, description="采样率"),
            "bitrate": ConfigField(type=int, default=128000, description="比特率"),
            # NOTE(review): float fields declared with default=None — confirm
            # ConfigField treats None as "unset/optional" rather than a value.
            "speed": ConfigField(type=float, default=None, description="语音速度(可选)"),
            "volume": ConfigField(type=float, default=None, description="音量(可选)"),
            "context_texts": ConfigField(
                type=list, default=None,
                description="上下文辅助文本(可选,仅豆包2.0模型)"
            )
        },
        "cosyvoice": {
            "gradio_url": ConfigField(
                type=str,
                default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/",
                description="Gradio API地址"
            ),
            "default_mode": ConfigField(
                type=str,
                default="3s极速复刻",
                description="推理模式(3s极速复刻/自然语言控制)"
            ),
            "default_instruct": ConfigField(
                type=str,
                default="You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
                description="默认指令(用于自然语言控制模式)"
            ),
            "reference_audio": ConfigField(
                type=str,
                default="",
                description="参考音频路径(用于3s极速复刻模式)"
            ),
            "prompt_text": ConfigField(
                type=str,
                default="",
                description="提示文本(用于3s极速复刻模式)"
            ),
            "timeout": ConfigField(type=int, default=300, description="API请求超时(秒)"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式")
        },
        "comfyui": {
            "server": ConfigField(
                type=str,
                default="http://127.0.0.1:8188",
                description="ComfyUI 服务地址(示例: http://127.0.0.1:8188)",
            ),
            # NOTE(review): machine-specific absolute defaults below — users
            # are expected to override these in config.toml.
            "input_dir": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
                description="ComfyUI input 目录(用于放参考音频,LoadAudio 会从这里读)",
            ),
            "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"),
            "audio_quality": ConfigField(
                type=str,
                default="128k",
                description="输出 MP3 质量(SaveAudioMP3 quality: V0/128k/320k)",
            ),
            "mlx_python": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python",
                description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)",
            ),
            "mlx_cli": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py",
                description="mlx_voice_clone_cli.py 路径",
            ),
            "default_style": ConfigField(type=str, default="default", description="默认风格名称"),
            "voiceclone_default_style": ConfigField(
                type=str,
                default="",
                description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style)",
            ),
            "customvoice_default_style": ConfigField(
                type=str,
                default="",
                description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style)",
            ),
            "auto_instruct_enabled": ConfigField(
                type=bool,
                default=False,
                description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)",
            ),
            "auto_instruct_max_chars": ConfigField(
                type=int,
                default=120,
                description="自动推断 instruct 的最大长度(字符)。建议 80-160,太短会导致情绪/表演提示被截断。",
            ),
            "auto_instruct_prompt": ConfigField(
                type=str,
                default="",
                description="自定义 instruct 推断 prompt(留空使用内置模板)",
            ),
            "auto_instruct_base_tone": ConfigField(
                type=str,
                default="",
                description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`)",
            ),
            "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"),
            "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"),
            "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"),
            "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"),
            "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "language": "",
                        "model_choice": "1.7B",
                        "precision": "bf16",
                        "seed": 0,
                        "max_new_tokens": 2048,
                        "top_p": 0.8,
                        "top_k": 20,
                        "temperature": 1.0,
                        "repetition_penalty": 1.05,
                    }
                ],
                description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True},
                    "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) ", "required": False},
                    "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False},
                    "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False},
                    "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False},
                    "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False},
                    "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False},
                    "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False},
                    "speed": {"type": "number", "label": "speed", "required": False},
                    "seed": {"type": "number", "label": "seed", "required": False},
                    "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False},
                    "top_p": {"type": "number", "label": "top_p", "required": False},
                    "top_k": {"type": "number", "label": "top_k", "required": False},
                    "temperature": {"type": "number", "label": "temperature", "required": False},
                    "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False},
                },
            ),
        }
    }

    def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]:
        """Return the enabled plugin components (action/command classes)."""
        components = []

        try:
            action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True)
            command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True)
            instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True)
        except AttributeError:
            # NOTE(review): presumably guards against get_config being
            # unavailable during early registration — confirm. Defaults to
            # everything enabled.
            action_enabled = True
            command_enabled = True
            instruct_enabled = True

        if action_enabled:
            components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction))

        if command_enabled:
            components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand))

        if instruct_enabled:
            components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand))

        return components
diff --git a/plugins/tts_voice_plugin/test.wav b/plugins/tts_voice_plugin/test.wav
new file mode 100644
index 00000000..37550701
Binary files /dev/null and b/plugins/tts_voice_plugin/test.wav differ
diff --git a/plugins/tts_voice_plugin/utils/__init__.py b/plugins/tts_voice_plugin/utils/__init__.py
new file mode 100644
index 00000000..1c0e5cd4
--- /dev/null
+++ b/plugins/tts_voice_plugin/utils/__init__.py
@@ -0,0 +1,12 @@
+"""
+TTS工具模块
+"""
+
+import sys
+sys.dont_write_bytecode = True
+
+from .text import TTSTextUtils
+from .session import TTSSessionManager
+from .file import TTSFileManager
+
+__all__ = ["TTSTextUtils", "TTSSessionManager", "TTSFileManager"]
diff --git a/plugins/tts_voice_plugin/utils/file.py b/plugins/tts_voice_plugin/utils/file.py
new file mode 100644
index 00000000..c56469a6
--- /dev/null
+++ b/plugins/tts_voice_plugin/utils/file.py
@@ -0,0 +1,280 @@
+"""
+文件操作工具类
+提供异步文件操作、临时文件管理等功能
+"""
+
+import os
+import uuid
+import tempfile
+import asyncio
+import base64
+from typing import Optional
+from src.common.logger import get_logger
+
logger = get_logger("tts_file_manager")

# Minimum size (bytes) below which audio data is rejected as invalid.
MIN_AUDIO_SIZE = 100
+
+
class TTSFileManager:
    """
    TTS file manager.

    Provides:
    - unique temp-file path generation (avoids concurrent-write clashes)
    - async and sync audio file writing
    - cleanup of temporary files (optionally delayed)
    - resolution of relative paths against the project root
    """

    # Temp-file directory (deprecated; kept for backwards compatibility).
    _temp_dir: Optional[str] = None

    # Project root used to resolve relative output paths.
    _project_root: Optional[str] = None

    @classmethod
    def set_project_root(cls, root_path: str):
        """Set the project root; ignored (with a warning) if the path is not a directory."""
        if os.path.isdir(root_path):
            cls._project_root = root_path
            logger.debug(f"设置项目根目录: {root_path}")
        else:
            logger.warning(f"项目根目录不存在: {root_path}")

    @classmethod
    def get_project_root(cls) -> str:
        """Return the project root, inferring it from this file's location on first use."""
        if cls._project_root is None:
            # Assumed layout: project_root/plugins/tts_voice_plugin/utils/file.py,
            # so the root is four dirname() hops up from this file.
            current_file = os.path.abspath(__file__)
            cls._project_root = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
            )
            logger.debug(f"自动推断项目根目录: {cls._project_root}")
        return cls._project_root

    @classmethod
    def resolve_path(cls, path: str) -> str:
        """
        Resolve *path* to an absolute path.

        Absolute paths are returned unchanged; relative paths are joined
        onto the project root.
        """
        if os.path.isabs(path):
            return path
        return os.path.join(cls.get_project_root(), path)

    @classmethod
    def ensure_dir(cls, dir_path: str) -> bool:
        """
        Ensure *dir_path* exists, creating it (and parents) if needed.

        Returns:
            True on success, False if creation failed.
        """
        try:
            os.makedirs(dir_path, exist_ok=True)
            return True
        except Exception as e:
            logger.error(f"创建目录失败: {dir_path}, 错误: {e}")
            return False

    @classmethod
    def get_temp_dir(cls) -> str:
        """Return the temp directory (deprecated; kept for compatibility)."""
        if cls._temp_dir is None:
            cls._temp_dir = tempfile.gettempdir()
        return cls._temp_dir

    @classmethod
    def set_temp_dir(cls, path: str):
        """
        Set the temp directory (deprecated; kept for compatibility).

        Raises:
            ValueError: if *path* is not an existing directory.
        """
        if os.path.isdir(path):
            cls._temp_dir = path
        else:
            raise ValueError(f"目录不存在: {path}")

    @classmethod
    def generate_temp_path(cls, prefix: str = "tts", suffix: str = ".mp3", output_dir: str = "") -> str:
        """
        Generate a unique temp-file path.

        Args:
            prefix: file-name prefix
            suffix: file extension (including the dot)
            output_dir: output directory (relative or absolute; empty means
                the project root)

        Returns:
            Absolute path of the (not yet created) temp file.
        """
        if not output_dir:
            # Default to the project root.
            resolved_dir = cls.get_project_root()
        else:
            resolved_dir = cls.resolve_path(output_dir)
            # Fall back to the project root if the directory cannot be created.
            if not cls.ensure_dir(resolved_dir):
                logger.warning(f"无法创建输出目录 {resolved_dir},使用项目根目录")
                resolved_dir = cls.get_project_root()

        # 12 hex chars of a UUID4 keep names short while avoiding collisions.
        unique_id = uuid.uuid4().hex[:12]
        filename = f"{prefix}_{unique_id}{suffix}"
        return os.path.join(resolved_dir, filename)

    @classmethod
    async def write_audio_async(cls, path: str, data: bytes) -> bool:
        """
        Asynchronously write audio bytes to *path*.

        Returns:
            True on success, False on any write error (logged).
        """
        try:
            # Run the blocking write in the default executor so the event
            # loop is not stalled. get_running_loop() is the supported API
            # inside a coroutine (get_event_loop() is deprecated here).
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, cls._write_file_sync, path, data)
            logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)")
            return True
        except IOError as e:
            logger.error(f"写入音频文件失败: {path}, 错误: {e}")
            return False
        except Exception as e:
            logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}")
            return False

    @staticmethod
    def _write_file_sync(path: str, data: bytes):
        """Synchronously write *data* to *path* (internal helper)."""
        with open(path, "wb") as f:
            f.write(data)

    @classmethod
    def write_audio_sync(cls, path: str, data: bytes) -> bool:
        """
        Synchronously write audio bytes to *path*.

        Returns:
            True on success, False on any write error (logged).
        """
        try:
            cls._write_file_sync(path, data)
            logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)")
            return True
        except IOError as e:
            logger.error(f"写入音频文件失败: {path}, 错误: {e}")
            return False
        except Exception as e:
            logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}")
            return False

    @classmethod
    def cleanup_file(cls, path: str, silent: bool = True) -> bool:
        """
        Remove a temp file if it exists.

        Args:
            path: file path (may be empty/None-ish; treated as a no-op)
            silent: when True, failures are swallowed without logging

        Returns:
            True if the file existed and was removed, False otherwise.
        """
        try:
            if path and os.path.exists(path):
                os.remove(path)
                logger.debug(f"临时文件已清理: {path}")
                return True
            return False
        except Exception as e:
            if not silent:
                logger.warning(f"清理临时文件失败: {path}, 错误: {e}")
            return False

    @classmethod
    async def cleanup_file_async(cls, path: str, delay: float = 0) -> bool:
        """
        Asynchronously remove a temp file, optionally after *delay* seconds.

        Returns:
            True if the file existed and was removed, False otherwise.
        """
        if delay > 0:
            await asyncio.sleep(delay)
        # get_running_loop() instead of the deprecated get_event_loop().
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, cls.cleanup_file, path, True)

    @classmethod
    def validate_audio_data(cls, data: bytes, min_size: Optional[int] = None) -> tuple:
        """
        Validate audio bytes.

        Args:
            data: audio binary data (may be None)
            min_size: minimum acceptable size in bytes; defaults to
                MIN_AUDIO_SIZE when None

        Returns:
            (is_valid, error_message) — error_message is "" when valid.
        """
        if data is None:
            return False, "音频数据为空"

        min_size = min_size or MIN_AUDIO_SIZE

        if len(data) < min_size:
            return False, f"音频数据过小({len(data)}字节 < {min_size}字节)"

        return True, ""

    @classmethod
    def audio_to_base64(cls, data: bytes) -> str:
        """
        Encode audio bytes as a base64 string.

        Returns:
            The base64 text, or "" if encoding failed (logged).
        """
        try:
            return base64.b64encode(data).decode('utf-8')
        except Exception as e:
            logger.error(f"音频数据转base64失败: {e}")
            return ""
diff --git a/plugins/tts_voice_plugin/utils/session.py b/plugins/tts_voice_plugin/utils/session.py
new file mode 100644
index 00000000..8535b04c
--- /dev/null
+++ b/plugins/tts_voice_plugin/utils/session.py
@@ -0,0 +1,186 @@
+"""
+HTTP Session 管理器
+提供连接池复用,避免每次请求创建新连接
+"""
+
+import asyncio
+import aiohttp
+from typing import Optional, Dict, Any
+from contextlib import asynccontextmanager
+from src.common.logger import get_logger
+
+logger = get_logger("tts_session_manager")
+
+
class TTSSessionManager:
    """
    TTS HTTP session manager.

    Provides:
    - one pooled aiohttp session per backend name
    - automatic timeout management
    - graceful shutdown of open sessions
    """

    # Process-wide singleton instance (lazily created by get_instance).
    _instance: Optional["TTSSessionManager"] = None
    # Guards singleton creation.
    # NOTE(review): created at import time; on Python < 3.10 asyncio.Lock
    # binds to the event loop current at creation — confirm the target runtime.
    _lock = asyncio.Lock()

    def __init__(self):
        # One ClientSession per backend name.
        self._sessions: Dict[str, aiohttp.ClientSession] = {}
        # Default total timeout (seconds) for sessions created without an
        # explicit timeout.
        self._default_timeout = 60

    @classmethod
    async def get_instance(cls) -> "TTSSessionManager":
        """Return the singleton instance, creating it on first use (double-checked lock)."""
        if cls._instance is None:
            async with cls._lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    async def get_session(
        self,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ) -> aiohttp.ClientSession:
        """
        Get or create the HTTP session for *backend_name*.

        A new session is created if none exists yet for this backend or the
        existing one has been closed.

        Args:
            backend_name: backend key; each backend gets its own session
            timeout: total timeout in seconds (falls back to the manager default)

        Returns:
            An open aiohttp.ClientSession.
        """
        if backend_name not in self._sessions or self._sessions[backend_name].closed:
            timeout_val = timeout or self._default_timeout
            connector = aiohttp.TCPConnector(
                limit=10,  # max simultaneous connections for this connector
                limit_per_host=5,
                ttl_dns_cache=300,  # cache DNS lookups for 5 minutes
                force_close=True,  # disable keep-alive; works around compatibility issues with some APIs (e.g. GSV2P)
            )
            self._sessions[backend_name] = aiohttp.ClientSession(
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=timeout_val)
            )
            logger.debug(f"创建新的HTTP Session: {backend_name}")

        return self._sessions[backend_name]

    async def close_session(self, backend_name: Optional[str] = None):
        """
        Close one session or all of them.

        Args:
            backend_name: backend key to close; when None, every session is
                closed and the registry is cleared.
        """
        if backend_name:
            if backend_name in self._sessions:
                await self._sessions[backend_name].close()
                del self._sessions[backend_name]
                logger.debug(f"关闭HTTP Session: {backend_name}")
        else:
            for name, session in self._sessions.items():
                if not session.closed:
                    await session.close()
                    logger.debug(f"关闭HTTP Session: {name}")
            self._sessions.clear()

    @asynccontextmanager
    async def post(
        self,
        url: str,
        json: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        data: Any = None,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ):
        """
        Send a POST request (async context manager).

        Args:
            url: request URL
            json: JSON body
            headers: request headers
            data: form/body data
            backend_name: backend key selecting which session to use
            timeout: per-request total timeout in seconds (overrides the
                session's timeout when given)

        Yields:
            aiohttp.ClientResponse (released when the context exits)

        Usage:
            async with session_manager.post(url, json=data) as response:
                ...
        """
        session = await self.get_session(backend_name, timeout)

        # A per-request timeout overrides the session-level one.
        req_timeout = None
        if timeout:
            req_timeout = aiohttp.ClientTimeout(total=timeout)

        response = await session.post(
            url,
            json=json,
            headers=headers,
            data=data,
            timeout=req_timeout
        )
        try:
            yield response
        finally:
            # Return the connection to the pool even if the caller raised.
            response.release()

    @asynccontextmanager
    async def get(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        params: Optional[Dict[str, Any]] = None,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ):
        """
        Send a GET request (async context manager).

        Args:
            url: request URL
            headers: request headers
            params: URL query parameters
            backend_name: backend key selecting which session to use
            timeout: per-request total timeout in seconds (overrides the
                session's timeout when given)

        Yields:
            aiohttp.ClientResponse (released when the context exits)

        Usage:
            async with session_manager.get(url) as response:
                ...
        """
        session = await self.get_session(backend_name, timeout)

        # A per-request timeout overrides the session-level one.
        req_timeout = None
        if timeout:
            req_timeout = aiohttp.ClientTimeout(total=timeout)

        response = await session.get(
            url,
            headers=headers,
            params=params,
            timeout=req_timeout
        )
        try:
            yield response
        finally:
            # Return the connection to the pool even if the caller raised.
            response.release()

    async def __aenter__(self):
        """Support `async with TTSSessionManager() as mgr:` usage."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close all sessions when the context exits."""
        await self.close_session()
diff --git a/plugins/tts_voice_plugin/utils/text.py b/plugins/tts_voice_plugin/utils/text.py
new file mode 100644
index 00000000..93524c08
--- /dev/null
+++ b/plugins/tts_voice_plugin/utils/text.py
@@ -0,0 +1,181 @@
+"""
+文本处理工具类
+"""
+
+import re
+from typing import Optional, List
+
+
class TTSTextUtils:
    """Text-processing helpers for the TTS plugin."""

    # Internet-slang to spoken-form replacements. Currently not applied by
    # clean_text (see below); kept as part of the public class interface.
    NETWORK_SLANG_MAP = {
        'www': '哈哈哈',
        'hhh': '哈哈',
        '233': '哈哈',
        '666': '厉害',
        '88': '拜拜',
        '...': '。',
        '……': '。'
    }

    # Whitelist-based filter for unsupported characters. Currently not
    # applied by clean_text; kept as part of the public class interface.
    SPECIAL_CHAR_PATTERN = re.compile(
        r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,。!?、;:()【】"\'.,!?;:()\[\]`-]'
    )

    # Per-script character classes used by detect_language.
    CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]')
    ENGLISH_PATTERN = re.compile(r'[a-zA-Z]')
    JAPANESE_PATTERN = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]')

    @classmethod
    def clean_text(cls, text: str, max_length: int = 500) -> str:
        """Normalize text for TTS.

        Args:
            text: raw input text
            max_length: advisory length hint only; no hard truncation is
                performed so upper layers can decide how to handle long text

        Returns:
            The whitespace-stripped text ("" for empty/None input).
        """
        # Character filtering and slang replacement are deliberately
        # disabled to preserve the original formatting.
        return text.strip() if text else ""

    @classmethod
    def detect_language(cls, text: str) -> str:
        """Guess the dominant language of *text*.

        Returns:
            One of "zh", "ja", "en"; "zh" is the fallback for empty or
            unclassifiable input.
        """
        if not text:
            return "zh"

        counts = {
            "zh": len(cls.CHINESE_PATTERN.findall(text)),
            "en": len(cls.ENGLISH_PATTERN.findall(text)),
            "ja": len(cls.JAPANESE_PATTERN.findall(text)),
        }
        total = sum(counts.values())
        if total == 0:
            return "zh"

        # Check order matters: Chinese wins at a lower threshold than English.
        if counts["zh"] / total > 0.3:
            return "zh"
        if counts["ja"] / total > 0.3:
            return "ja"
        if counts["en"] / total > 0.8:
            return "en"
        return "zh"

    @classmethod
    def resolve_voice_alias(
        cls,
        voice: Optional[str],
        alias_map: dict,
        default: str,
        prefix: str = ""
    ) -> str:
        """Map a user-facing voice name to an internal voice ID.

        Resolution order: an already-prefixed internal ID is returned as-is;
        otherwise the alias map is consulted for the requested voice, then
        for the default, and finally the default itself is returned.

        Args:
            voice: user-specified voice (None/"" falls back to *default*)
            alias_map: alias -> internal-ID mapping
            default: default voice name
            prefix: internal voice-ID prefix (e.g. "lucy-voice-")

        Returns:
            The resolved internal voice ID.
        """
        requested = voice or default

        if prefix and requested.startswith(prefix):
            return requested

        if requested in alias_map:
            return alias_map[requested]

        return alias_map.get(default, default)

    @classmethod
    def split_sentences(cls, text: str, min_length: int = 2) -> List[str]:
        """Split *text* into sentences on CJK/ASCII terminal punctuation.

        Args:
            text: text to split
            min_length: fragments shorter than this are appended to the
                preceding sentence instead of standing alone

        Returns:
            List of stripped sentences (empty list for empty input).
        """
        if not text:
            return []

        # Split while keeping the delimiters so each terminator can be
        # re-attached to the sentence it ends.
        delimiter = r'([。!?!?;;])'
        pieces = re.split(delimiter, text)

        sentences: List[str] = []
        pending = ""
        for piece in pieces:
            if not piece:
                continue
            if re.match(delimiter, piece):
                # Punctuation stays with the sentence it terminates.
                pending += piece
            else:
                if pending.strip():
                    sentences.append(pending.strip())
                pending = piece

        # Flush the trailing fragment.
        if pending.strip():
            sentences.append(pending.strip())

        # Fold too-short fragments into their predecessor.
        if min_length > 0 and len(sentences) > 1:
            folded: List[str] = []
            for sentence in sentences:
                if folded and len(sentence) < min_length:
                    folded[-1] += sentence
                else:
                    folded.append(sentence)
            sentences = folded

        return sentences