commit 0873d9e68879f7a8f814f656bf0e488b6d2b64b6
Author: Seiun <140777969+lonh-jing@users.noreply.github.com>
Date:   Wed Feb 11 10:26:47 2026 +0800

    Squashed 'plugins/tts_voice_plugin/' content from commit d14ba1bd

    git-subtree-dir: plugins/tts_voice_plugin
    git-subtree-split: d14ba1bdf00b09521a4eab8fd66ee83c64f2314c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f937ce2b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,40 @@
+# Sensitive configuration files
+config.toml
+config.toml.backup.*
+config.toml.reset.*
+
+# Python cache files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# Virtual environments
+venv/
+ENV/
+env/
+
+# IDE settings
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Temporary files
+*.log
+*.tmp
+.DS_Store
+
+# Generated audio files
+tts_*.mp3
+tts_*.wav
+tts_*.ogg
+
+# Data directory (holds temporary audio files)
+data/
+
+# Spec workflow directory
+.spec-workflow/
+
+# Claude settings
+.claude/

diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..0ad25db4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals.
This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..9cd5f91c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,311 @@
+# TTS Speech Synthesis Plugin
+
+A text-to-speech plugin for MaiBot with support for multiple TTS backends.
+
+## Supported backends
+
+| Backend | Description | Scope |
+|------|------|----------|
+| AI Voice | Built into MaiCore, no configuration needed | Group chat only |
+| GSV2P | Cloud API, requires a token | Group / private chat |
+| GPT-SoVITS | Local service, self-deployed | Group / private chat |
+| Doubao TTS | Volcengine cloud service, high quality | Group / private chat |
+| CosyVoice | Alibaba CosyVoice3, supports dialects and voice cloning | Group / private chat |
+| ComfyUI | Local ComfyUI workflow API (MLX Qwen3-TTS VoiceClone) | Group / private chat |
+
+## Installation
+
+```bash
+pip install aiohttp gradio_client
+```
+
+## Configuration
+
+Edit `config.toml` and set the default backend:
+
+```toml
+[general]
+default_backend = "cosyvoice"  # options: ai_voice / gsv2p / gpt_sovits / doubao / cosyvoice / comfyui
+audio_output_dir = ""          # audio output directory; leave empty to use the project root
+use_base64_audio = false       # send audio as base64 (fallback option)
+split_sentences = true         # send speech in segments (long text sentence by sentence)
+split_delay = 0.3              # delay between segments, in seconds
+send_error_messages = true     # send error messages to the chat (false = fail silently)
+```
+
+### Docker environment notes
+
+**Problem:** in a Docker environment, audio uploads may fail or file paths may not be resolved (e.g. `识别URL失败`, "failed to recognize URL").
+
+**Solutions (in recommended order):**
+
+#### Option 1: use a relative path (recommended)
+
+```toml
+[general]
+audio_output_dir = ""  # leave empty; defaults to the project root
+```
+
+Audio files are then saved in the project root, where OneBot/NapCat can resolve relative paths correctly.
+
+#### Option 2: custom output directory
+
+```toml
+[general]
+audio_output_dir = "data/tts_audio"   # relative to the project root
+# or
+audio_output_dir = "/app/data/audio"  # absolute path
+```
+
+#### Option 3: base64 encoding (fallback)
+
+If neither path option works, enable base64 sending:
+
+```toml
+[general]
+use_base64_audio = true  # send audio base64-encoded (adds roughly 33% to the payload size)
+```
+
+### Doubao TTS configuration
+
+```toml
+[doubao]
+app_id = "your APP_ID"
+access_key = "your ACCESS_KEY"
+resource_id = "seed-tts-2.0"
+default_voice = "zh_female_vv_uranus_bigtts"
+```
+
+**Preset voices:**
+
+| Voice name | voice_type |
+|----------|------------|
+| vivi 2.0 | zh_female_vv_uranus_bigtts |
+| 大壹 | zh_male_dayi_saturn_bigtts |
+| 黑猫侦探社咪仔 | zh_female_mizai_saturn_bigtts |
+
+**Cloned voices:** set `resource_id` to `seed-icl-2.0` and put the voice ID (e.g. `S_xxxxxx`) in `default_voice`.
+
+Credentials: [Volcengine console](https://console.volcengine.com/speech/service/8)
+
+### GSV2P configuration
+
+```toml
+[gsv2p]
+api_token = "your token"
+default_voice = "原神-中文-派蒙_ZH"
+```
+
+Get a token at [https://tts.acgnai.top](https://tts.acgnai.top)
+
+### AI Voice configuration
+
+```toml
+[ai_voice]
+default_character = "温柔妹妹"
+```
+
+22 preset voices are available, including 小新, 猴哥, 妲己, 酥心御姐, 温柔妹妹, and 邻家小妹.
+
+### GPT-SoVITS configuration
+
+**Two configuration formats are supported:**
+
+#### Format 1: array format (recommended, WebUI-friendly)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[[gpt_sovits.styles]]
+name = "default"
+refer_wav = "/path/to/reference.wav"
+prompt_text = "reference transcript"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt"    # optional: dynamic model switching
+sovits_weights = "/path/to/model.pth"  # optional: dynamic model switching
+
+[[gpt_sovits.styles]]
+name = "happy"
+refer_wav = "/path/to/happy.wav"
+prompt_text = "a happy reference transcript"
+prompt_language = "zh"
+```
+
+#### Format 2: dictionary format (legacy-compatible)
+
+```toml
+[gpt_sovits]
+server = "http://127.0.0.1:9880"
+
+[gpt_sovits.styles.default]
+refer_wav = "/path/to/reference.wav"
+prompt_text = "reference transcript"
+prompt_language = "zh"
+gpt_weights = "/path/to/model.ckpt"
+sovits_weights = "/path/to/model.pth"
+```
+
+> **Tip:** the plugin detects and accepts both formats automatically; the array format is recommended for better WebUI support.
+
+### CosyVoice configuration
+
+```toml
+[cosyvoice]
+gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
+default_mode = "3s极速复刻"  # or "自然语言控制" (natural-language control)
+default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"  # only used in 自然语言控制 mode; ignored in 3s极速复刻 mode
+reference_audio = "/path/to/ref.wav"  # reference audio path
+prompt_text = "transcript of the reference audio"
+timeout = 300  # API timeout in seconds
+```
+
+**Supported dialects / emotions / speeds:**
+
+| Type | Options |
+|------|----------|
+| Dialects | 广东话, 东北话, 四川话, 上海话, 闽南话, 山东话, 陕西话, 湖南话, and more (17 in total) |
+| Emotions | 开心 (happy), 伤心 (sad), 生气 (angry) |
+| Speed | 慢速 (slow), 快速 (fast) |
+| Volume | 大声 (loud), 小声 (quiet) |
+| Special styles | 小猪佩奇 (Peppa Pig), 机器人 (robot) |
+
+**Inference modes:**
+- `3s极速复刻` (3-second rapid cloning): requires a reference audio clip for voice cloning
+- `自然语言控制` (natural-language control): controls dialect, emotion, speed, etc. through instructions
+
+### ComfyUI configuration
+
+This backend runs a ComfyUI workflow through the HTTP API (`/prompt` -> `/history` -> `/view`) and uses `LoadAudio` to read the reference audio from ComfyUI's `input` directory.
+
+```toml
+[comfyui]
+server = "http://127.0.0.1:8188"
+input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
+timeout = 120
+audio_quality = "128k"  # SaveAudioMP3: V0/128k/320k
+mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
+mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
+default_style = "default"
+
+[[comfyui.styles]]
+name = "default"
+refer_wav = "/path/to/ref.wav"
+prompt_text = "verbatim transcript of the reference audio"
+language = "Auto"  # options: Auto/Chinese/English/Japanese...
+model_choice = "1.7B"
+precision = "bf16"
+seed = 0
+max_new_tokens = 2048
+top_p = 0.8
+top_k = 20
+temperature = 1.0
+repetition_penalty = 1.05
+```
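+
+For reference, here is a minimal sketch of the `/prompt` -> `/history` -> `/view` round trip this backend performs. It is illustrative only: the graph payload, node ids, and the exact shape of the history output depend on your workflow, and the polling loop is deliberately simplified (no client_id, websocket events, or timeout handling).
+
+```python
+import asyncio
+import aiohttp
+
+async def run_workflow(server: str, prompt_graph: dict) -> bytes:
+    """Queue an API-format prompt graph and fetch the first audio output."""
+    async with aiohttp.ClientSession() as session:
+        # 1. Queue the graph; ComfyUI answers with a prompt_id.
+        async with session.post(f"{server}/prompt", json={"prompt": prompt_graph}) as resp:
+            prompt_id = (await resp.json())["prompt_id"]
+
+        # 2. Poll /history until the prompt shows up with outputs.
+        while True:
+            async with session.get(f"{server}/history/{prompt_id}") as resp:
+                history = await resp.json()
+            if prompt_id in history and history[prompt_id].get("outputs"):
+                outputs = history[prompt_id]["outputs"]
+                break
+            await asyncio.sleep(1)
+
+        # 3. Download the saved file via /view. The "audio" key is what
+        #    audio save nodes report; adjust to match your workflow.
+        for node_output in outputs.values():
+            for item in node_output.get("audio", []):
+                params = {
+                    "filename": item["filename"],
+                    "subfolder": item.get("subfolder", ""),
+                    "type": item.get("type", "output"),
+                }
+                async with session.get(f"{server}/view", params=params) as resp:
+                    return await resp.read()
+    raise RuntimeError("workflow produced no audio output")
+```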
请用广东话表达。<|endofprompt|>" # 只有自然语言控制模式才会生效,3s极速复刻模式下不生效 +reference_audio = "/path/to/ref.wav" # 参考音频路径 +prompt_text = "参考音频对应的文本" # 参考音频的对应文本 +timeout = 300 # API超时(秒) +``` + +**支持的方言/情感/语速:** + +| 类型 | 可用选项 | +|------|----------| +| 方言 | 广东话、东北话、四川话、上海话、闽南话、山东话、陕西话、湖南话等17种 | +| 情感 | 开心、伤心、生气 | +| 语速 | 慢速、快速 | +| 音量 | 大声、小声 | +| 特殊风格 | 小猪佩奇、机器人 | + +**推理模式:** +- `3s极速复刻`:需要提供参考音频进行声音克隆 +- `自然语言控制`:通过指令控制方言、情感、语速等 + +## 使用方法 + +### 命令触发 + +``` +/tts 你好世界 # 使用默认后端 +/tts 今天天气不错 小新 # 指定音色 +/gsv2p 你好世界 # 使用 GSV2P +/doubao 你好世界 # 使用豆包 +/cosyvoice 你好世界 四川话 # 使用 CosyVoice,四川话 +/comfyui 你好世界 -v default # 使用 ComfyUI 本地工作流(MLX VoiceClone) +``` + +## ComfyUI 后端配置 + +该后端通过 ComfyUI 的 HTTP API 执行工作流(`/prompt` -> `/history` -> `/view`),并用 `LoadAudio` 从 ComfyUI 的 `input` 目录读取参考音频。 + +```toml +[comfyui] +server = "http://127.0.0.1:8188" +input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input" +timeout = 120 +audio_quality = "128k" # SaveAudioMP3: V0/128k/320k +mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python" +mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py" +default_style = "default" + +[[comfyui.styles]] +name = "default" +refer_wav = "/path/to/ref.wav" +prompt_text = "参考音频逐字稿" +language = "Auto" # 可选: Auto/Chinese/English/Japanese... +model_choice = "1.7B" +precision = "bf16" +seed = 0 +max_new_tokens = 2048 +top_p = 0.8 +top_k = 20 +temperature = 1.0 +repetition_penalty = 1.05 +``` + +### 自动触发 + +LLM 判断需要语音回复时会自动触发,可通过概率控制: + +```toml +[probability] +enabled = false # 默认关闭,每次都触发语音 +base_probability = 0.3 # 启用时 30% 概率触发 +``` + +### 智能分割插件支持 + +本插件已适配智能分割插件,支持使用 `|||SPLIT|||` 分隔符进行精确分段: + +- **优先级**:智能分割标记 > 自动句子分割 > 单句发送 +- **使用方式**:智能分割插件会在适当位置插入 `|||SPLIT|||` 标记,本插件自动识别并按标记分段发送 +- **示例**:`今天天气不错|||SPLIT|||适合出去玩|||SPLIT|||你觉得呢` 会分成三段语音依次发送 + +## 项目结构 + +``` +tts_voice_plugin/ +├── plugin.py # 插件入口 +├── config.toml # 配置文件 +├── backends/ # 后端实现 +│ ├── ai_voice.py +│ ├── gsv2p.py +│ ├── gpt_sovits.py +│ ├── doubao.py +│ └── cosyvoice.py +└── utils/ # 工具函数 +``` + +## 常见问题 + +**Q: Docker环境中提示"文件处理失败 识别URL失败"?** +A: 留空 `audio_output_dir` 配置项,插件将使用项目根目录保存音频(相对路径)。如仍有问题,可设置 `use_base64_audio = true` 使用base64编码发送。 + +**Q: AI Voice 提示"仅支持群聊"?** +A: AI Voice 只能在群聊使用,私聊会自动切换到其他后端。 + +**Q: 豆包语音怎么获取凭证?** +A: 登录火山引擎控制台,开通语音合成服务获取。 + +**Q: 文本太长被截断?** +A: 修改 `config.toml` 中 `max_text_length = 1000` + +**Q: 语音合成失败时不想让Bot发送错误消息?** +A: 设置 `send_error_messages = false`,语音合成失败时将静默处理,不向用户发送错误提示。 + +## 更新日志 + +### v3.2.3 +- 修复豆包语音 WAV 流式响应合并问题(正确处理 LIST/INFO 元数据块和多 header 情况) +- 默认后端改为 CosyVoice(更稳定的声音克隆体验) +- 默认关闭概率控制(每次触发都生成语音,更可预期的行为) +- 优化 LLM 长度约束提示(利用"近因效应"提高遵守率) +- 优化 action 记录格式,帮助 planner 避免重复执行 +- GSV2P/豆包音频格式默认改为 WAV(更好的兼容性) +- CosyVoice 默认模式改为 3s 极速复刻(更快响应) +- 更新默认超时配置(CosyVoice 300s, GSV2P 120s) + +### v3.2.2 +- 适配智能分割插件(支持 `|||SPLIT|||` 分隔符精确分段) +- GPT-SoVITS 支持数组格式配置(WebUI 友好,向后兼容字典格式) +- 修复豆包语音音色信息显示乱码问题 +- 优化配置文件注释,更简洁清晰 +- 优化分段发送逻辑优先级(智能分割 > 自动分割 > 单句) +- 禁用 Python 字节码生成(保持目录干净) +- 添加插件 ID 标识字段 + +### v3.2.1 +- 新增 `send_error_messages` 配置项(可选择关闭错误提示消息) +- 统一错误消息处理逻辑(通过 `_send_error` 方法) + +### v3.2.0 +- 新增 CosyVoice 后端(阿里云 ModelScope,支持 17 种方言、3 秒声音克隆) +- 新增分段发送功能(长文本自动分割逐句发送) +- GPT-SoVITS 支持动态模型切换(在风格配置中指定 gpt_weights/sovits_weights) +- GSV2P 后端新增重试机制(5 次重试,3 秒间隔) +- 新增 `/cosyvoice` 命令 +- 新增 gradio_client 依赖 + +### v3.1.0 +- 新增豆包语音后端(火山引擎云服务) +- 重构为模块化架构 +- HTTP Session 复用优化 + +## 信息 + +- 版本:3.2.3 +- 作者:靓仔 +- 许可:AGPL-v3.0 diff --git a/_manifest.json b/_manifest.json new file mode 100644 index 00000000..d640b6a3 --- 
diff --git a/_manifest.json b/_manifest.json
new file mode 100644
index 00000000..d640b6a3
--- /dev/null
+++ b/_manifest.json
@@ -0,0 +1,235 @@
+{
+  "manifest_version": 1,
+  "name": "Unified TTS Speech Synthesis Plugin",
+  "version": "3.2.3",
+  "description": "Unified TTS speech synthesis plugin integrating the AI Voice, GSV2P, GPT-SoVITS, Doubao TTS, CosyVoice, and ComfyUI backend engines for flexible speech synthesis.",
+  "author": {
+    "name": "靓仔",
+    "url": "https://github.com/xuqian13"
+  },
+  "license": "AGPL-v3.0",
+  "homepage_url": "",
+  "repository_url": "https://github.com/xuqian13/tts_voice_plugin",
+  "keywords": [
+    "TTS",
+    "speech synthesis",
+    "text-to-speech",
+    "AI voice",
+    "GSV2P",
+    "GPT-SoVITS",
+    "Doubao",
+    "CosyVoice",
+    "Volcengine",
+    "multi-backend",
+    "voice",
+    "read aloud",
+    "voice style",
+    "voice broadcast",
+    "dialect",
+    "voice cloning",
+    "MaiCore"
+  ],
+  "categories": [
+    "Voice",
+    "AI",
+    "Chat Enhancement",
+    "Entertainment",
+    "Utility",
+    "Communication",
+    "Accessibility"
+  ],
+  "host_application": {
+    "min_version": "0.12.0"
+  },
+  "default_locale": "zh-CN",
+  "plugin_info": {
+    "is_built_in": false,
+    "plugin_type": "general",
+    "components": [
+      {
+        "type": "action",
+        "name": "unified_tts_action",
+        "description": "Unified TTS action; switches intelligently among the configured backend engines, triggered autonomously by the LLM"
+      },
+      {
+        "type": "command",
+        "name": "unified_tts_command",
+        "description": "Unified TTS command; supports /tts, /voice, /gsv2p, /doubao and other command forms, with flexible backend and voice selection"
+      }
+    ],
+    "features": [
+      "Six TTS backends: AI Voice, GSV2P, GPT-SoVITS, Doubao TTS, CosyVoice, ComfyUI",
+      "AI Voice: built into MaiCore, simple and fast, 22+ preset voices",
+      "GSV2P: cloud API, high-quality synthesis, rich tuning parameters",
+      "GPT-SoVITS: local service, highly customizable, multi-style support",
+      "Doubao TTS: ByteDance cloud service, cloned voices and emotion control",
+      "CosyVoice: Alibaba speech synthesis, 17 dialects, 3-second voice cloning, emotion control",
+      "Modular architecture; backends are implemented independently and are easy to extend",
+      "HTTP session reuse for better performance",
+      "Automatic cleanup of temporary files to avoid concurrency conflicts",
+      "Smart trigger mode (LLM decides autonomously) and manual command mode",
+      "Probability control to keep voice replies from becoming too frequent",
+      "Smart language detection (Chinese/English/Japanese)",
+      "Automatic text cleanup and internet-slang normalization",
+      "Robust error handling and retry mechanisms",
+      "Flexible configuration system with independent per-backend settings"
+    ],
+    "dependencies": {
+      "python": [
+        "aiohttp",
+        "gradio_client"
+      ],
+      "system": [],
+      "plugins": []
+    },
+    "backend_info": {
+      "ai_voice": {
+        "provider": "MaiCore built-in",
+        "endpoint": "AI_VOICE_SEND command",
+        "authentication": "none",
+        "limitations": "group chats only",
+        "voices": "22+ preset voices (小新, 妲己, 酥心御姐, ...)"
+      },
+      "gsv2p": {
+        "provider": "GSV2P cloud service",
+        "endpoint": "https://gsv2p.acgnai.top/v1/audio/speech",
+        "authentication": "API token required",
+        "limitations": "API rate limits",
+        "features": "high-quality synthesis, multilingual, rich parameter tuning"
+      },
+      "gpt_sovits": {
+        "provider": "local GPT-SoVITS service",
+        "endpoint": "http://127.0.0.1:9880",
+        "authentication": "none",
+        "limitations": "requires a locally deployed service",
+        "features": "highly customizable, multi-style support, model weight switching"
+      },
+      "doubao": {
+        "provider": "ByteDance Volcengine",
+        "endpoint": "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
+        "authentication": "app_id, access_key, and resource_id required",
+        "limitations": "requires a Volcengine account",
+        "features": "fast and high quality, cloned voices, emotion and tone control"
+      },
+      "cosyvoice": {
+        "provider": "Alibaba CosyVoice",
+        "endpoint": "ModelScope Gradio API",
+        "authentication": "none (public Gradio endpoint)",
+        "limitations": "depends on ModelScope service availability",
+        "features": "3-second voice cloning, 17 dialects, emotion and speed control, natural-language instructions"
+      }
+    }
+  },
+  "configuration": {
+    "config_file": "config.toml",
+    "config_template": "config.toml.example",
+    "auto_generate": true,
+    "sections": [
+      {
+        "name": "plugin",
+        "description": "Basic plugin settings"
+      },
+      {
+        "name": "general",
+        "description": "General settings (default backend, timeouts, text length, etc.)"
+      },
+      {
+        "name": "components",
+        "description": "Component enable/disable switches"
+      },
+      {
+        "name": "probability",
+        "description": "Probability control (keeps voice replies from becoming too frequent)"
+      },
+      {
+        "name": "ai_voice",
+        "description": "AI Voice backend settings (voice alias map, etc.)"
+      },
+      {
+        "name": "gsv2p",
+        "description": "GSV2P backend settings (API address, token, parameters, etc.)"
+      },
+      {
+        "name": "gpt_sovits",
+        "description": "GPT-SoVITS backend settings (server address, styles, etc.)"
+      },
+      {
+        "name": "doubao",
+        "description": "Doubao TTS backend settings (Volcengine credentials, voices, emotions, etc.)"
+      },
+      {
+        "name": "cosyvoice",
+        "description": "CosyVoice backend settings (Gradio URL, mode, dialects, etc.)"
+      }
+    ]
+  },
+  "usage_examples": [
+    {
"type": "action", + "backend": "auto", + "description": "LLM自动触发语音回复", + "example": "用户:请用语音说\"你好世界\"\n机器人:[使用默认后端自动生成语音文件并发送]" + }, + { + "type": "command", + "backend": "ai_voice", + "description": "手动命令使用AI Voice", + "example": "/tts 你好世界 小新" + }, + { + "type": "command", + "backend": "gsv2p", + "description": "手动命令使用GSV2P", + "example": "/gsv2p 今天天气不错" + }, + { + "type": "command", + "backend": "doubao", + "description": "手动命令使用豆包语音", + "example": "/doubao 你好世界" + }, + { + "type": "command", + "backend": "gpt_sovits", + "description": "手动命令使用GPT-SoVITS", + "example": "/tts 测试一下 default gpt_sovits" + }, + { + "type": "command", + "backend": "cosyvoice", + "description": "手动命令使用CosyVoice", + "example": "/cosyvoice 你好世界 四川话" + }, + { + "type": "command", + "backend": "auto", + "description": "使用默认后端", + "example": "/tts 你好世界" + } + ], + "migration_info": { + "from_plugins": [ + "ai_voice_plugin (v1.0.0)", + "gsv2p_tts_plugin (v1.0.0)", + "tts_voice_plugin (v2.0.0)", + "tts_voice_plugin (v3.0.0)" + ], + "migration_notes": [ + "本插件整合了ai_voice_plugin、gsv2p_tts_plugin和旧版tts_voice_plugin的所有功能", + "v3.2.2适配智能分割插件(支持|||SPLIT|||分隔符精确分段)", + "v3.2.2支持GPT-SoVITS数组格式配置(WebUI友好,向后兼容字典格式)", + "v3.2.2修复豆包语音音色信息显示乱码问题", + "v3.2.2优化配置文件注释,更简洁清晰", + "v3.2.0新增CosyVoice后端支持(阿里云语音合成,支持17种方言和3秒声音克隆)", + "v3.1.0新增豆包语音后端支持", + "v3.1.0重构为模块化架构,提升代码可维护性", + "配置文件需要重新生成,原配置需手动迁移", + "建议备份旧插件配置后再迁移", + "AI Voice音色映射保持兼容", + "GSV2P API配置需重新填写Token", + "GPT-SoVITS风格配置需要重新设置", + "新增config.toml.example模板文件" + ] + }, + "id": "tts_voice_plugin" +} \ No newline at end of file diff --git a/backends/__init__.py b/backends/__init__.py new file mode 100644 index 00000000..ddcafef1 --- /dev/null +++ b/backends/__init__.py @@ -0,0 +1,38 @@ +""" +TTS后端模块 +""" + +import sys +sys.dont_write_bytecode = True + +from .base import TTSBackendBase, TTSBackendRegistry, TTSResult +from .ai_voice import AIVoiceBackend +from .gsv2p import GSV2PBackend +from .gpt_sovits import GPTSoVITSBackend +from .doubao import DoubaoBackend +from .cosyvoice import CosyVoiceBackend +from .comfyui import ComfyUIBackend, ComfyUIVoiceCloneBackend, ComfyUICustomVoiceBackend + +# 注册后端 +TTSBackendRegistry.register("ai_voice", AIVoiceBackend) +TTSBackendRegistry.register("gsv2p", GSV2PBackend) +TTSBackendRegistry.register("gpt_sovits", GPTSoVITSBackend) +TTSBackendRegistry.register("doubao", DoubaoBackend) +TTSBackendRegistry.register("cosyvoice", CosyVoiceBackend) +TTSBackendRegistry.register("comfyui", ComfyUIBackend) +TTSBackendRegistry.register("comfyui_voiceclone", ComfyUIVoiceCloneBackend) +TTSBackendRegistry.register("comfyui_customvoice", ComfyUICustomVoiceBackend) + +__all__ = [ + "TTSBackendBase", + "TTSBackendRegistry", + "TTSResult", + "AIVoiceBackend", + "GSV2PBackend", + "GPTSoVITSBackend", + "DoubaoBackend", + "CosyVoiceBackend", + "ComfyUIBackend", + "ComfyUIVoiceCloneBackend", + "ComfyUICustomVoiceBackend", +] diff --git a/backends/ai_voice.py b/backends/ai_voice.py new file mode 100644 index 00000000..c916fa00 --- /dev/null +++ b/backends/ai_voice.py @@ -0,0 +1,133 @@ +""" +AI Voice 后端实现 +使用 MaiCore 内置的 AI 语音功能 +""" + +from typing import Optional, Callable, Dict +from .base import TTSBackendBase, TTSResult +from ..utils.text import TTSTextUtils +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_ai_voice") + +# AI Voice 音色映射表 +AI_VOICE_ALIAS_MAP = { + "小新": "lucy-voice-laibixiaoxin", + "猴哥": "lucy-voice-houge", + "四郎": "lucy-voice-silang", + "东北老妹儿": "lucy-voice-guangdong-f1", + "广西大表哥": 
"lucy-voice-guangxi-m1", + "妲己": "lucy-voice-daji", + "霸道总裁": "lucy-voice-lizeyan", + "酥心御姐": "lucy-voice-suxinjiejie", + "说书先生": "lucy-voice-m8", + "憨憨小弟": "lucy-voice-male1", + "憨厚老哥": "lucy-voice-male3", + "吕布": "lucy-voice-lvbu", + "元气少女": "lucy-voice-xueling", + "文艺少女": "lucy-voice-f37", + "磁性大叔": "lucy-voice-male2", + "邻家小妹": "lucy-voice-female1", + "低沉男声": "lucy-voice-m14", + "傲娇少女": "lucy-voice-f38", + "爹系男友": "lucy-voice-m101", + "暖心姐姐": "lucy-voice-female2", + "温柔妹妹": "lucy-voice-f36", + "书香少女": "lucy-voice-f34" +} + + +class AIVoiceBackend(TTSBackendBase): + """ + AI Voice 后端 + + 使用 MaiCore 内置的 AI 语音功能 + 注意:仅支持群聊环境 + """ + + backend_name = "ai_voice" + backend_description = "MaiCore内置AI语音(仅群聊)" + support_private_chat = False # 不支持私聊 + default_audio_format = "" # AI Voice不需要音频格式 + + def __init__(self, config_getter, log_prefix: str = ""): + super().__init__(config_getter, log_prefix) + self._send_command = None # 由外部注入 + + def set_send_command(self, send_command_func: Callable) -> None: + """设置发送命令的函数(由Action/Command注入)""" + self._send_command = send_command_func + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.AI_VOICE_DEFAULT_CHARACTER, "温柔妹妹") + + def resolve_voice(self, voice: Optional[str]) -> str: + """解析音色别名""" + alias_map: Dict[str, str] = self.get_config( + ConfigKeys.AI_VOICE_ALIAS_MAP, + AI_VOICE_ALIAS_MAP + ) + default_voice = self.get_default_voice() + return TTSTextUtils.resolve_voice_alias( + voice, + alias_map, + default_voice, + prefix="lucy-voice-" + ) + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行AI Voice语音合成 + + Args: + text: 待转换的文本 + voice: 音色名称或别名 + + Returns: + TTSResult + """ + if not self._send_command: + return TTSResult( + success=False, + message="AI Voice后端未正确初始化(缺少send_command)", + backend_name=self.backend_name + ) + + # 解析音色 + character = self.resolve_voice(voice) + + try: + success = await self._send_command( + command_name="AI_VOICE_SEND", + args={"text": text, "character": character}, + storage_message=False + ) + + if success: + logger.info(f"{self.log_prefix} AI语音发送成功 (音色: {character})") + return TTSResult( + success=True, + message=f"成功发送AI语音 (音色: {character})", + backend_name=self.backend_name + ) + else: + return TTSResult( + success=False, + message="AI语音命令发送失败", + backend_name=self.backend_name + ) + + except Exception as e: + logger.error(f"{self.log_prefix} AI语音执行错误: {e}") + return TTSResult( + success=False, + message=f"AI语音执行错误: {e}", + backend_name=self.backend_name + ) diff --git a/backends/base.py b/backends/base.py new file mode 100644 index 00000000..9d8936f4 --- /dev/null +++ b/backends/base.py @@ -0,0 +1,239 @@ +""" +TTS后端抽象基类和注册表 +""" + +import asyncio +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, Type, Optional, Any, Callable, Tuple, Union +from src.common.logger import get_logger +from ..config_keys import ConfigKeys + +logger = get_logger("tts_backend") + + +@dataclass +class TTSResult: + """TTS执行结果""" + success: bool + message: str + audio_path: Optional[str] = None + backend_name: str = "" + + def __iter__(self): + """支持解包为 (success, message)""" + return iter((self.success, self.message)) + + +class TTSBackendBase(ABC): + """ + TTS后端抽象基类 + + 所有TTS后端必须继承此类并实现 execute 方法 + """ + + # 后端名称(子类必须覆盖) + backend_name: str = "base" + + # 后端描述 + backend_description: str = "TTS后端基类" + + # 是否支持私聊 + support_private_chat: bool = True + + # 默认音频格式 + default_audio_format: str = 
"mp3" + + def __init__(self, config_getter: Callable[[str, Any], Any], log_prefix: str = ""): + """ + 初始化后端 + + Args: + config_getter: 配置获取函数,签名为 get_config(key, default) + log_prefix: 日志前缀 + """ + self.get_config = config_getter + self.log_prefix = log_prefix or f"[{self.backend_name}]" + self._send_custom = None + + def set_send_custom(self, send_custom_func: Callable) -> None: + """设置发送自定义消息的函数""" + self._send_custom = send_custom_func + + async def send_audio( + self, + audio_data: bytes, + audio_format: str = "mp3", + prefix: str = "tts", + voice_info: str = "" + ) -> TTSResult: + """ + 统一的音频发送方法 + + Args: + audio_data: 音频二进制数据 + audio_format: 音频格式(如mp3、wav) + prefix: 文件名前缀 + voice_info: 音色信息(用于日志) + + Returns: + TTSResult + """ + from ..utils.file import TTSFileManager + + # 检查是否使用base64发送 + use_base64 = self.get_config(ConfigKeys.GENERAL_USE_BASE64_AUDIO, False) + logger.debug(f"{self.log_prefix} 开始发送音频 (原始大小: {len(audio_data)}字节, 格式: {audio_format})") + + if use_base64: + # 使用base64编码发送 + base64_audio = TTSFileManager.audio_to_base64(audio_data) + if not base64_audio: + return TTSResult(False, "音频数据转base64失败", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} base64编码完成,准备通过send_custom发送") + if self._send_custom: + await self._send_custom(message_type="voice", content=base64_audio) + logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (base64模式, 音频大小: {len(audio_data)}字节)") + else: + logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音") + return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name) + + return TTSResult( + success=True, + message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}, base64模式", + backend_name=self.backend_name + ) + else: + # 使用文件路径发送 + output_dir = self.get_config(ConfigKeys.GENERAL_AUDIO_OUTPUT_DIR, "") + audio_path = TTSFileManager.generate_temp_path( + prefix=prefix, + suffix=f".{audio_format}", + output_dir=output_dir + ) + + if not await TTSFileManager.write_audio_async(audio_path, audio_data): + return TTSResult(False, "保存音频文件失败", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} 音频文件已保存, 路径: {audio_path}") + # 发送语音 + if self._send_custom: + await self._send_custom(message_type="voiceurl", content=audio_path) + logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (文件路径模式, 路径: {audio_path})") + # 延迟清理临时文件 + asyncio.create_task(TTSFileManager.cleanup_file_async(audio_path, delay=30)) + else: + logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音") + return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name) + + return TTSResult( + success=True, + message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}", + audio_path=audio_path, + backend_name=self.backend_name + ) + + @abstractmethod + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行TTS转换 + + Args: + text: 待转换的文本 + voice: 音色/风格 + **kwargs: 其他参数(如emotion等) + + Returns: + TTSResult 包含执行结果 + """ + raise NotImplementedError + + def validate_config(self) -> Tuple[bool, str]: + """ + 验证后端配置是否完整 + + Returns: + (is_valid, error_message) + """ + return True, "" + + def get_default_voice(self) -> str: + """获取默认音色""" + return "" + + def is_available(self) -> bool: + """检查后端是否可用""" + is_valid, _ = self.validate_config() + return is_valid + + +class TTSBackendRegistry: + """ + TTS后端注册表 + + 使用策略模式 + 工厂模式管理后端 + """ + + _backends: Dict[str, Type[TTSBackendBase]] = {} + + @classmethod + def register(cls, name: str, backend_class: 
+
+
+class TTSBackendRegistry:
+    """
+    TTS backend registry.
+
+    Manages backends via the strategy + factory patterns.
+    """
+
+    _backends: Dict[str, Type[TTSBackendBase]] = {}
+
+    @classmethod
+    def register(cls, name: str, backend_class: Type[TTSBackendBase]) -> None:
+        """
+        Register a backend.
+
+        Args:
+            name: backend name
+            backend_class: backend class
+        """
+        cls._backends[name] = backend_class
+        logger.debug(f"注册TTS后端: {name}")
+
+    @classmethod
+    def unregister(cls, name: str) -> None:
+        """Unregister a backend."""
+        if name in cls._backends:
+            del cls._backends[name]
+
+    @classmethod
+    def get(cls, name: str) -> Optional[Type[TTSBackendBase]]:
+        """Return the backend class."""
+        return cls._backends.get(name)
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        config_getter: Callable[[str, Any], Any],
+        log_prefix: str = ""
+    ) -> Optional[TTSBackendBase]:
+        """
+        Create a backend instance.
+
+        Args:
+            name: backend name
+            config_getter: config accessor
+            log_prefix: log prefix
+
+        Returns:
+            a backend instance, or None
+        """
+        backend_class = cls.get(name)
+        if backend_class:
+            return backend_class(config_getter, log_prefix)
+        return None
+
+    @classmethod
+    def list_backends(cls) -> list[str]:
+        """List the names of all registered backends."""
+        return list(cls._backends.keys())
+
+    @classmethod
+    def is_registered(cls, name: str) -> bool:
+        """Check whether a backend is registered."""
+        return name in cls._backends

diff --git a/backends/comfyui.py b/backends/comfyui.py
new file mode 100644
index 00000000..d574e9fe
--- /dev/null
+++ b/backends/comfyui.py
@@ -0,0 +1,827 @@
+"""
+ComfyUI backend (Workflow API).
+
+This backend calls a fixed ComfyUI prompt graph:
+LoadAudio -> MLX_Qwen3TTSVoiceClone -> SaveAudioMP3
+
+Rationale:
+- ComfyUI expects API-format "prompt" graphs (not UI workflow json).
+- For audio inputs, the simplest reliable path is to copy the reference audio into ComfyUI/input
+  and use the built-in LoadAudio node.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import os
+import re
+import time
+import uuid
+from typing import Any, ClassVar, Dict, Optional, Tuple
+from urllib.parse import urlencode
+
+from src.common.logger import get_logger
+from src.plugin_system.apis import generator_api
+
+from .base import TTSBackendBase, TTSResult
+from ..config_keys import ConfigKeys
+from ..utils.file import TTSFileManager
+from ..utils.session import TTSSessionManager
+from ..utils.text import TTSTextUtils
+
+logger = get_logger("tts_comfyui")
+
+
+LANG_TO_DEMO = {
+    "zh": "Chinese",
+    "ja": "Japanese",
+    "en": "English",
+}
+
+
+class ComfyUIBackend(TTSBackendBase):
+    backend_name = "comfyui"
+    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone/CustomVoice)"
+    support_private_chat = True
+    default_audio_format = "mp3"
+
+    _ref_cache: ClassVar[Dict[str, str]] = {}
+    _instruct_cache: ClassVar[Dict[str, str]] = {}
+    # If set by subclasses, only these modes are allowed (e.g. {"voice_clone"}).
+    allowed_modes: ClassVar[Optional[set[str]]] = None
+
+    def get_default_voice(self) -> str:
+        return self.get_config(ConfigKeys.COMFYUI_DEFAULT_STYLE, "default")
+
+    def _filter_styles_by_mode(self, styles: Dict[str, Any]) -> Dict[str, Any]:
+        allowed = self.allowed_modes
+        if not allowed:
+            return styles
+        out: Dict[str, Any] = {}
+        for name, st in (styles or {}).items():
+            if not isinstance(st, dict):
+                continue
+            mode = str(st.get("mode") or "voice_clone").strip()
+            if mode in allowed:
+                out[name] = st
+        return out
+
+    def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
+        # Match GPT-SoVITS backend style schema: list[{name,...}] or dict{name:{...}}
+        if isinstance(styles_config, dict):
+            return styles_config
+        if isinstance(styles_config, list):
+            result = {}
+            for style in styles_config:
+                if isinstance(style, dict) and "name" in style:
+                    name = style["name"]
+                    result[name] = {k: v for k, v in style.items() if k != "name"}
+            return result
+        return {}
+
+    def _clean_instruct(self, s: str, max_chars: int) -> str:
+        s = (s or "").strip()
+        if not s:
+            return ""
+
+        # Strip common wrappers.
+        s = s.replace("```", "").strip()
+        s = re.sub(r"^instruct\s*[::]\s*", "", s, flags=re.IGNORECASE).strip()
+
+        # Prefer first non-empty line.
+        for line in s.splitlines():
+            line = line.strip()
+            if line:
+                s = line
+                break
+
+        # Trim quotes.
+        if len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")):
+            s = s[1:-1].strip()
+
+        if max_chars and len(s) > max_chars:
+            s = s[:max_chars].rstrip()
+        return s
+
+    def _clean_base_tone(self, s: str) -> str:
+        """
+        Clean a base tone/persona string so it can safely live inside `基调=...`:
+        - single-line
+        - no semicolons (they are field separators)
+        - no '=' (KV separator)
+        """
+        s = (s or "").strip()
+        if not s:
+            return ""
+        s = s.replace("\r", " ").replace("\n", " ")
+        s = re.sub(r"\s+", " ", s).strip()
+        # Avoid breaking KV parsing.
+        s = s.replace(";", ",").replace(";", ",")
+        s = s.replace("=", " ").replace("=", " ")
+        return s.strip(" ,,")
+
+    def _attach_base_tone(self, instruct: str, max_chars: int) -> str:
+        """
+        If configured, prefix inferred instruct with a fixed base tone/persona:
+        `基调=<...>;情绪=...;语速=...;停顿=...`
+
+        Priority when trimming: keep the inferred instruct fields intact if possible.
+        """
+        base_raw = self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or ""
+        base = self._clean_base_tone(str(base_raw))
+        if not base:
+            return (instruct or "").strip()
+
+        s = (instruct or "").strip()
+        fields = self._parse_instruct_fields(s)
+        if "基调" in fields:
+            return s
+
+        prefix = f"基调={base}"
+        if not s:
+            return prefix[:max_chars].rstrip() if max_chars else prefix
+
+        combined = f"{prefix};{s}"
+        if not max_chars or len(combined) <= max_chars:
+            return combined
+
+        # Too long: try trimming base first, keeping inferred instruct intact.
+        remain = max_chars - len(s) - len(";") - len("基调=")
+        if remain <= 0:
+            # Can't fit base at all; keep instruct (already max_chars-limited upstream).
+            return s[:max_chars].rstrip()
+        base_trim = base[:remain].rstrip(" ,,")
+        return f"基调={base_trim};{s}"
+
+    def _parse_instruct_fields(self, instruct: str) -> Dict[str, str]:
+        """
+        Parse a 1-line instruct like:
+            情绪=愤怒;语速=很快;停顿=很少;表现=咬牙切齿
+
+        We only *use* a few keys (情绪/语速/停顿/强度/表现...), but keep it generic.
+        """
+        s = (instruct or "").strip()
+        if not s:
+            return {}
+
+        # Normalize separators (full-width punctuation).
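+        # Illustrative example: "情绪=愤怒;语速=很快;停顿=很少" normalizes to
+        # "情绪=愤怒;语速=很快;停顿=很少" and then parses to
+        # {"情绪": "愤怒", "语速": "很快", "停顿": "很少"}.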
+ s = s.replace(";", ";").replace(":", ":").replace("=", "=") + + # Split by semicolon/comma-like separators. + parts = [p.strip() for p in re.split(r"[;]+", s) if p.strip()] + out: Dict[str, str] = {} + for p in parts: + if "=" not in p: + continue + k, v = p.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + # Limit key length to avoid garbage. + if len(k) > 8: + continue + out[k] = v + return out + + def _map_speed_label(self, label: str) -> Optional[float]: + lab = (label or "").strip() + m = { + "很慢": 0.85, + "稍慢": 0.93, + "正常": 1.00, + "稍快": 1.07, + "很快": 1.15, + } + return m.get(lab) + + def _map_pause_label(self, label: str) -> Optional[float]: + lab = (label or "").strip() + m = { + "很少": 0.6, + "自然": 1.0, + "稍多": 1.3, + "很多": 1.7, + } + return m.get(lab) + + def _ensure_base_pause_cfg(self, pause_cfg: Dict[str, float]) -> Dict[str, float]: + # If caller didn't configure pauses (all zeros), apply a conservative base so "停顿" can take effect. + keys = ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"] + if all(float(pause_cfg.get(k, 0.0) or 0.0) == 0.0 for k in keys): + return { + **pause_cfg, + "pause_linebreak": 0.18, + "period_pause": 0.22, + "comma_pause": 0.10, + "question_pause": 0.20, + "hyphen_pause": 0.06, + } + return pause_cfg + + def _enrich_instruct_for_emotion(self, instruct: str, max_chars: int) -> str: + """ + Add short performance cues for common emotions, keeping it single-line KV style. + This helps when the model under-reacts to simple labels like "愤怒". + """ + s = (instruct or "").strip() + if not s: + return "" + + fields = self._parse_instruct_fields(s) + emo = fields.get("情绪", "") + if not emo: + return s + + # Only add if it doesn't already contain a "表现=" field. + if "表现" in fields: + return s + + emo_norm = emo + cues = "" + if "愤怒" in emo_norm or "生气" in emo_norm: + cues = "声压高,咬字重,重音强,尾音下压" + elif "开心" in emo_norm or "高兴" in emo_norm: + cues = "笑意明显,轻快上扬,尾音明亮" + elif "悲伤" in emo_norm or "难过" in emo_norm: + cues = "气声略多,音量偏低,语尾下沉" + elif "温柔" in emo_norm: + cues = "音量轻,气声柔,语尾轻收" + elif "冷淡" in emo_norm or "冷静" in emo_norm: + cues = "平直克制,少起伏,干净收尾" + + if not cues: + return s + + extra = f";表现={cues}" + if max_chars and len(s) + len(extra) > max_chars: + # Trim cues to fit. + allow = max_chars - len(s) - len(";表现=") + if allow <= 0: + return s[:max_chars].rstrip() + cues = cues[:allow].rstrip(",, ") + extra = f";表现={cues}" + return (s + extra)[:max_chars].rstrip() if max_chars else (s + extra) + + def _apply_instruct_controls( + self, instruct: str, speed: float, pause_cfg: Dict[str, float], max_chars: int + ) -> Tuple[str, float, Dict[str, float]]: + """ + If instruct contains '语速'/'停顿', map them to real synthesis controls. + This makes auto_instruct meaningfully affect output even if the model is insensitive to labels. 
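+
+        Illustrative example: "情绪=愤怒;语速=很快;停顿=很少" maps speed to 1.15
+        via _map_speed_label and scales every pause value by 0.6 via
+        _map_pause_label (after _ensure_base_pause_cfg supplies base pauses
+        when none are configured).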
+ """ + s = (instruct or "").strip() + if not s: + return "", speed, pause_cfg + + fields = self._parse_instruct_fields(s) + speed_label = fields.get("语速", "") + pause_label = fields.get("停顿", "") + + out_speed = float(speed) + mapped_speed = self._map_speed_label(speed_label) + if mapped_speed is not None: + out_speed = mapped_speed + + out_pause_cfg = dict(pause_cfg or {}) + mapped_pause = self._map_pause_label(pause_label) + if mapped_pause is not None: + out_pause_cfg = self._ensure_base_pause_cfg(out_pause_cfg) + for k in ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]: + try: + out_pause_cfg[k] = float(out_pause_cfg.get(k, 0.0) or 0.0) * float(mapped_pause) + except Exception: + pass + + # Add short performance cues (kept within max_chars). + s = self._enrich_instruct_for_emotion(s, max_chars=max_chars) + return s, out_speed, out_pause_cfg + + async def _infer_instruct( + self, + text: str, + detected_lang: str, + chat_stream=None, + chat_id: Optional[str] = None, + style_name: str = "", + ) -> str: + """ + Infer a short CustomVoice `instruct` string from the target text via MaiBot's LLM interface. + """ + enabled = bool(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_ENABLED, False)) + if not enabled: + return "" + + max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40) + + # Default prompt: output ONE short instruct line only. + default_tpl = ( + "你是配音导演。请根据要朗读的文本生成一行 TTS instruct。\\n" + "硬性要求:必须同时包含【情绪】【语速】【停顿】三项。可以额外补充 1-2 个表演提示(如 音量/重音/音高/表现)。\\n" + "只输出一行,不要解释,不要复述原文,不要引号/代码块。\\n" + "输出格式固定为:情绪=<...>;语速=<...>;停顿=<...>\\n" + "语速可选:很慢/稍慢/正常/稍快/很快。\\n" + "停顿可选:很少/自然/稍多/很多。\\n" + "长度<= {max_chars} 字。\\n" + "文本语言: {lang}\\n" + "待朗读文本: {text}\\n" + ) + prompt_tpl = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_PROMPT, default_tpl) or "") + if not prompt_tpl.strip(): + prompt_tpl = default_tpl + + # Cache key should change if prompt/base_tone/max_chars changes. 
+        base_raw = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or "")
+        cfg_sig_src = f"{max_chars}\n{prompt_tpl}\n{base_raw}"
+        cfg_sig = hashlib.sha256(cfg_sig_src.encode("utf-8")).hexdigest()[:12]
+        text_sig = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
+        cache_key = f"{cfg_sig}:{detected_lang}:{text_sig}"
+        cached = self._instruct_cache.get(cache_key)
+        if cached:
+            return cached
+
+        lang = detected_lang or "auto"
+        prompt = prompt_tpl.format(text=text.strip(), lang=lang, max_chars=max_chars)
+
+        try:
+            resp = await generator_api.generate_tts_instruct(
+                prompt=prompt,
+                request_type="tts_instruct",
+            )
+            instruct = self._clean_instruct(resp or "", max_chars=max_chars)
+            instruct = self._attach_base_tone(instruct, max_chars=max_chars)
+            if instruct:
+                self._instruct_cache[cache_key] = instruct
+            return instruct
+        except Exception as e:
+            logger.warning(f"{self.log_prefix} auto_instruct 失败(style={style_name}): {e}")
+            return ""
+
+    def validate_config(self) -> Tuple[bool, str]:
+        server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
+        if not server:
+            return False, "ComfyUI 未配置 server"
+
+        input_dir = self.get_config(
+            ConfigKeys.COMFYUI_INPUT_DIR,
+            "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
+        )
+        if not input_dir:
+            return False, "ComfyUI 未配置 input_dir"
+
+        styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
+        styles = self._normalize_styles_config(styles_raw)
+        if not styles:
+            return False, "ComfyUI 后端未配置任何风格(至少需要配置 1 个 style)"
+
+        default_name = self.get_default_voice() or "default"
+        if default_name not in styles:
+            # Fallback to "default" if present.
+            if "default" in styles:
+                default_name = "default"
+            else:
+                return False, f"ComfyUI default_style='{default_name}' 不存在"
+
+        st = styles.get(default_name, {})
+        mode = (st.get("mode") or "voice_clone").strip()
+        if mode == "voice_clone":
+            if not st.get("refer_wav") or not st.get("prompt_text"):
+                return False, f"ComfyUI 风格 '{default_name}' 配置不完整(voice_clone 需要 refer_wav 和 prompt_text)"
+        elif mode == "custom_voice":
+            if not st.get("model_path") or not st.get("speaker"):
+                return False, f"ComfyUI 风格 '{default_name}' 配置不完整(custom_voice 需要 model_path 和 speaker)"
+        else:
+            return False, f"ComfyUI 风格 '{default_name}' mode 无效: {mode}"
+
+        return True, ""
+
+    def _ensure_ref_in_input(self, input_dir: str, refer_wav: str) -> str:
+        refer_wav = TTSFileManager.resolve_path(refer_wav)
+        if not os.path.exists(refer_wav):
+            raise FileNotFoundError(f"参考音频不存在: {refer_wav}")
+
+        st = os.stat(refer_wav)
+        cache_key = f"{os.path.abspath(refer_wav)}:{st.st_mtime_ns}:{st.st_size}"
+        if cache_key in self._ref_cache:
+            name = self._ref_cache[cache_key]
+            if os.path.exists(os.path.join(input_dir, name)):
+                return name
+
+        ext = os.path.splitext(refer_wav)[1] or ".wav"
+        h = hashlib.sha256(cache_key.encode("utf-8")).hexdigest()[:16]
+        name = f"maibot_ref_{h}{ext}"
+        dst = os.path.join(input_dir, name)
+
+        os.makedirs(input_dir, exist_ok=True)
+        if not os.path.exists(dst):
+            # Keep it simple: copy file bytes. LoadAudio can decode common formats (wav/mp3).
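+            # copyfile (rather than copy2) is sufficient here: the cache key above
+            # already encodes the source file's mtime and size, so preserving
+            # metadata on the copy does not affect cache validity.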
+ import shutil + + shutil.copyfile(refer_wav, dst) + + self._ref_cache[cache_key] = name + return name + + def _build_prompt_voice_clone( + self, + ref_filename: str, + ref_text: str, + target_text: str, + language: str, + model_choice: str, + precision: str, + seed: int, + max_new_tokens: int, + top_p: float, + top_k: int, + temperature: float, + repetition_penalty: float, + audio_quality: str, + mlx_python: str, + mlx_cli: str, + pause_cfg: Dict[str, float], + ) -> Dict[str, Any]: + # Node IDs are arbitrary but stable in this prompt template. + # 1: LoadAudio -> outputs AUDIO + # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG + # 3: MLX VoiceClone -> outputs AUDIO + # 4: SaveAudioMP3 -> outputs UI audio file info + filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}" + prompt: Dict[str, Any] = { + "1": { + "class_type": "LoadAudio", + "inputs": { + "audio": ref_filename, + }, + }, + "2": { + "class_type": "FB_Qwen3TTSConfig", + "inputs": { + "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)), + "period_pause": float(pause_cfg.get("period_pause", 0.0)), + "comma_pause": float(pause_cfg.get("comma_pause", 0.0)), + "question_pause": float(pause_cfg.get("question_pause", 0.0)), + "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)), + }, + }, + "3": { + "class_type": "MLX_Qwen3TTSVoiceClone", + "inputs": { + "target_text": target_text, + "model_choice": model_choice, + "device": "auto", + "precision": precision, + "language": language, + "ref_audio": ["1", 0], + "ref_text": ref_text, + "seed": int(seed), + "max_new_tokens": int(max_new_tokens), + "top_p": float(top_p), + "top_k": int(top_k), + "temperature": float(temperature), + "repetition_penalty": float(repetition_penalty), + "attention": "auto", + "unload_model_after_generate": False, + "config": ["2", 0], + "mlx_python": mlx_python, + "mlx_cli": mlx_cli, + }, + }, + "4": { + "class_type": "SaveAudioMP3", + "inputs": { + "audio": ["3", 0], + "filename_prefix": filename_prefix, + "quality": audio_quality, + }, + }, + } + return prompt + + def _build_prompt_custom_voice( + self, + target_text: str, + speaker: str, + model_path: str, + instruct: str, + speed: float, + language: str, + seed: int, + max_new_tokens: int, + top_p: float, + top_k: int, + temperature: float, + repetition_penalty: float, + audio_quality: str, + mlx_python: str, + mlx_cli: str, + pause_cfg: Dict[str, float], + ) -> Dict[str, Any]: + # 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG + # 3: MLX CustomVoice -> outputs AUDIO + # 4: SaveAudioMP3 -> outputs UI audio file info + filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}" + prompt: Dict[str, Any] = { + "2": { + "class_type": "FB_Qwen3TTSConfig", + "inputs": { + "pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)), + "period_pause": float(pause_cfg.get("period_pause", 0.0)), + "comma_pause": float(pause_cfg.get("comma_pause", 0.0)), + "question_pause": float(pause_cfg.get("question_pause", 0.0)), + "hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)), + }, + }, + "3": { + "class_type": "MLX_Qwen3TTSCustomVoice", + "inputs": { + "text": target_text, + "speaker": speaker, + "model_path": model_path, + "instruct": instruct or "", + "speed": float(speed), + "language": language, + "seed": int(seed), + "max_new_tokens": int(max_new_tokens), + "top_p": float(top_p), + "top_k": int(top_k), + "temperature": float(temperature), + "repetition_penalty": float(repetition_penalty), + "config": ["2", 0], + 
"mlx_python": mlx_python, + "mlx_cli": mlx_cli, + }, + }, + "4": { + "class_type": "SaveAudioMP3", + "inputs": { + "audio": ["3", 0], + "filename_prefix": filename_prefix, + "quality": audio_quality, + }, + }, + } + return prompt + + async def _queue_and_wait( + self, server: str, prompt: Dict[str, Any], timeout: int + ) -> Dict[str, Any]: + session_manager = await TTSSessionManager.get_instance() + prompt_id = str(uuid.uuid4()) + + post_url = f"{server.rstrip('/')}/prompt" + payload = { + "prompt": prompt, + "client_id": "maibot-tts-voice-plugin", + "prompt_id": prompt_id, + } + + async with session_manager.post( + post_url, json=payload, backend_name=self.backend_name, timeout=timeout + ) as resp: + data = await resp.json(content_type=None) + if resp.status != 200: + raise RuntimeError(f"ComfyUI /prompt 失败: {resp.status} {str(data)[:200]}") + if "error" in data: + raise RuntimeError(f"ComfyUI /prompt 返回错误: {data['error']}") + + # Poll history until prompt_id appears + hist_url = f"{server.rstrip('/')}/history/{prompt_id}" + deadline = time.time() + float(timeout) + while time.time() < deadline: + async with session_manager.get( + hist_url, backend_name=self.backend_name, timeout=timeout + ) as resp: + history = await resp.json(content_type=None) + if prompt_id in history: + return history[prompt_id] + await asyncio.sleep(0.35) + + raise TimeoutError("等待 ComfyUI 生成超时") + + async def _download_output_audio(self, server: str, history_item: Dict[str, Any], timeout: int) -> bytes: + outputs = history_item.get("outputs") or {} + node_out = outputs.get("4") or {} + audios = node_out.get("audio") or [] + if not audios: + # Some failures show up only in status/messages. + status = history_item.get("status") or {} + raise RuntimeError(f"ComfyUI 未产出音频. status={status}") + + a0 = audios[0] + filename = a0.get("filename") + subfolder = a0.get("subfolder", "") + folder_type = a0.get("type", "output") + if not filename: + raise RuntimeError(f"ComfyUI 音频输出结构异常: {a0}") + + q = urlencode({"filename": filename, "subfolder": subfolder, "type": folder_type}) + url = f"{server.rstrip('/')}/view?{q}" + + session_manager = await TTSSessionManager.get_instance() + async with session_manager.get(url, backend_name=self.backend_name, timeout=timeout) as resp: + if resp.status != 200: + txt = await resp.text() + raise RuntimeError(f"ComfyUI /view 失败: {resp.status} {txt[:200]}") + return await resp.read() + + async def execute(self, text: str, voice: Optional[str] = None, **kwargs) -> TTSResult: + is_valid, err = self.validate_config() + if not is_valid: + return TTSResult(False, err, backend_name=self.backend_name) + + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188") + input_dir = self.get_config( + ConfigKeys.COMFYUI_INPUT_DIR, + "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input", + ) + timeout = int(self.get_config(ConfigKeys.COMFYUI_TIMEOUT, self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60))) + + audio_quality = self.get_config(ConfigKeys.COMFYUI_AUDIO_QUALITY, "128k") + mlx_python = self.get_config( + ConfigKeys.COMFYUI_MLX_PYTHON, + "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python", + ) + mlx_cli = self.get_config( + ConfigKeys.COMFYUI_MLX_CLI, + "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py", + ) + + styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {}) + styles = 
self._filter_styles_by_mode(self._normalize_styles_config(styles_raw)) + + style_name = (voice or self.get_default_voice() or "").strip() or "default" + if style_name not in styles: + # For split backends (voiceclone/customvoice), make "wrong style" errors explicit. + if (voice or "").strip() and self.allowed_modes: + return TTSResult( + False, + f"ComfyUI风格 '{style_name}' 不存在或不属于当前后端({self.backend_name})", + backend_name=self.backend_name, + ) + # Fallback order: "default" -> first available style. + if "default" in styles: + style_name = "default" + elif styles: + style_name = sorted(styles.keys())[0] + else: + return TTSResult( + False, + f"ComfyUI 未配置任何风格({self.backend_name})", + backend_name=self.backend_name, + ) + style = styles.get(style_name, {}) + + mode = (style.get("mode") or "voice_clone").strip() + if mode == "voice_clone": + refer_wav = style.get("refer_wav", "") + prompt_text = style.get("prompt_text", "") + if not refer_wav or not prompt_text: + return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(voice_clone)", backend_name=self.backend_name) + elif mode == "custom_voice": + model_path = style.get("model_path", "") + speaker = style.get("speaker", "") + if not model_path or not speaker: + return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(custom_voice)", backend_name=self.backend_name) + else: + return TTSResult(False, f"ComfyUI风格 '{style_name}' mode 无效: {mode}", backend_name=self.backend_name) + + # Map language to the MLX node's language combo. Default to Auto. + detected = TTSTextUtils.detect_language(text) + language = style.get("language") or LANG_TO_DEMO.get(detected, "Auto") + + # Sampling defaults match the MLX node defaults we exposed. + seed = int(style.get("seed", 0) or 0) + model_choice = str(style.get("model_choice", "1.7B") or "1.7B") + precision = str(style.get("precision", "bf16") or "bf16") + max_new_tokens = int(style.get("max_new_tokens", 2048) or 2048) + top_p = float(style.get("top_p", 0.8) or 0.8) + top_k = int(style.get("top_k", 20) or 20) + temperature = float(style.get("temperature", 1.0) or 1.0) + repetition_penalty = float(style.get("repetition_penalty", 1.05) or 1.05) + + pause_cfg = { + "pause_linebreak": float(self.get_config(ConfigKeys.COMFYUI_PAUSE_LINEBREAK, 0.0)), + "period_pause": float(self.get_config(ConfigKeys.COMFYUI_PERIOD_PAUSE, 0.0)), + "comma_pause": float(self.get_config(ConfigKeys.COMFYUI_COMMA_PAUSE, 0.0)), + "question_pause": float(self.get_config(ConfigKeys.COMFYUI_QUESTION_PAUSE, 0.0)), + "hyphen_pause": float(self.get_config(ConfigKeys.COMFYUI_HYPHEN_PAUSE, 0.0)), + } + # Allow per-style override. + if isinstance(style.get("pause_cfg"), dict): + for k in pause_cfg.keys(): + if k in style["pause_cfg"]: + try: + pause_cfg[k] = float(style["pause_cfg"][k]) + except Exception: + pass + + try: + if mode == "voice_clone": + ref_filename = self._ensure_ref_in_input(input_dir, style.get("refer_wav", "")) + prompt = self._build_prompt_voice_clone( + ref_filename=ref_filename, + ref_text=style.get("prompt_text", ""), + target_text=text, + language=language, + model_choice=model_choice, + precision=precision, + seed=seed, + max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + audio_quality=audio_quality, + mlx_python=mlx_python, + mlx_cli=mlx_cli, + pause_cfg=pause_cfg, + ) + else: + # Allow per-style / automatic instruct inference. 
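+                # A style opts in with instruct = "__AUTO__", or with
+                # auto_instruct = true and an empty instruct; either way the
+                # LLM-inferred line below replaces the static value.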
+ instruct = str(style.get("instruct", "")).strip() + auto_style = bool(style.get("auto_instruct", False)) + inferred = "" + if instruct == "__AUTO__" or (not instruct and auto_style): + chat_stream = kwargs.get("chat_stream") + chat_id = kwargs.get("chat_id") + inferred = await self._infer_instruct( + text=text, + detected_lang=detected, + chat_stream=chat_stream, + chat_id=chat_id, + style_name=style_name, + ) + if inferred: + instruct = inferred + + # If the instruct contains usable fields, map them to real controls. + max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40) + instruct, mapped_speed, mapped_pause_cfg = self._apply_instruct_controls( + instruct=instruct, + speed=float(style.get("speed", 1.0) or 1.0), + pause_cfg=pause_cfg, + max_chars=max_chars, + ) + + prompt = self._build_prompt_custom_voice( + target_text=text, + speaker=str(style.get("speaker", "")).strip(), + model_path=str(style.get("model_path", "")).strip(), + instruct=instruct, + speed=mapped_speed, + language=language, + seed=seed, + max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + audio_quality=audio_quality, + mlx_python=mlx_python, + mlx_cli=mlx_cli, + pause_cfg=mapped_pause_cfg, + ) + + logger.info(f"{self.log_prefix} ComfyUI请求: text='{text[:50]}...', style={style_name}") + history_item = await self._queue_and_wait(server, prompt, timeout=timeout) + audio_bytes = await self._download_output_audio(server, history_item, timeout=timeout) + + ok, msg = TTSFileManager.validate_audio_data(audio_bytes) + if not ok: + return TTSResult(False, f"ComfyUI 返回音频无效: {msg}", backend_name=self.backend_name) + + return await self.send_audio( + audio_data=audio_bytes, + audio_format="mp3", + prefix="tts_comfyui", + voice_info=f"style: {style_name}", + ) + except Exception as e: + return TTSResult(False, f"ComfyUI后端错误: {e}", backend_name=self.backend_name) + + +class ComfyUIVoiceCloneBackend(ComfyUIBackend): + backend_name = "comfyui_voiceclone" + backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone 专用)" + allowed_modes = {"voice_clone"} + + def get_default_voice(self) -> str: + v = self.get_config(ConfigKeys.COMFYUI_VOICECLONE_DEFAULT_STYLE, "") or "" + v = v.strip() + return v or super().get_default_voice() + + +class ComfyUICustomVoiceBackend(ComfyUIBackend): + backend_name = "comfyui_customvoice" + backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS CustomVoice 专用)" + allowed_modes = {"custom_voice"} + + def get_default_voice(self) -> str: + v = self.get_config(ConfigKeys.COMFYUI_CUSTOMVOICE_DEFAULT_STYLE, "") or "" + v = v.strip() + return v or super().get_default_voice() diff --git a/backends/cosyvoice.py b/backends/cosyvoice.py new file mode 100644 index 00000000..28199815 --- /dev/null +++ b/backends/cosyvoice.py @@ -0,0 +1,285 @@ +""" +CosyVoice后端实现 +使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成 +""" + +import asyncio +import os +import shutil +from typing import Optional, Tuple +from .base import TTSBackendBase, TTSResult +from ..utils.file import TTSFileManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_cosyvoice") + +# CosyVoice指令映射表(方言、情感、语速等) +COSYVOICE_INSTRUCT_MAP = { + # 方言 + "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>", + "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>", + "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>", + "贵州话": "You are a helpful assistant. 
请用贵州话表达。<|endofprompt|>", + "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>", + "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>", + "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>", + "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>", + "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>", + "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>", + "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>", + "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>", + "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>", + "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>", + "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>", + "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>", + "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>", + + # 音量 + "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>", + "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>", + + # 语速 + "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>", + "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>", + + # 情感 + "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>", + "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>", + "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>", + + # 特殊风格 + "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>", + "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>", +} + + +class CosyVoiceBackend(TTSBackendBase): + """ + CosyVoice语音后端 + + 使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成 + 支持3秒极速复刻、自然语言控制(方言、情感、语速等) + """ + + backend_name = "cosyvoice" + backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)" + support_private_chat = True + default_audio_format = "wav" + + def get_default_voice(self) -> str: + """获取默认音色(CosyVoice 不需要预设音色)""" + return "" + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "") + + if not gradio_url: + return False, "CosyVoice后端缺少必需的 gradio_url 配置" + + return True, "" + + def _resolve_instruct(self, emotion: Optional[str]) -> str: + """ + 解析情感参数为指令文本 + + Args: + emotion: 情感/方言关键词 + + Returns: + 指令文本 + """ + if emotion and emotion in COSYVOICE_INSTRUCT_MAP: + return COSYVOICE_INSTRUCT_MAP[emotion] + + # 返回默认指令(确保不为空) + default_instruct = self.get_config( + ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT, + "You are a helpful assistant. 请用广东话表达。<|endofprompt|>" + ) + + # 如果配置为空,强制使用广东话 + if not default_instruct or not default_instruct.strip(): + default_instruct = "You are a helpful assistant. 
请用广东话表达。<|endofprompt|>" + + return default_instruct + + async def execute( + self, + text: str, + voice: Optional[str] = None, + emotion: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行 CosyVoice 语音合成 + + Args: + text: 待转换的文本 + voice: 音色(对于CosyVoice,这个参数用于指定参考音频路径) + emotion: 情感/方言/语速参数 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "") + mode_config = self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, "3s极速复刻") + + # mode_checkbox_group 实际上是 Radio 组件,期望字符串而不是列表 + # 处理配置可能返回字符串或列表的情况 + if isinstance(mode_config, list): + mode_str = mode_config[0] if mode_config else "3s极速复刻" + else: + mode_str = mode_config if mode_config else "3s极速复刻" + + timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60) + reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "") + prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "") + + # CosyVoice 的"自然语言控制"模式实际上需要参考音频和 prompt_text + # 如果没有配置,使用默认的参考音频 + if not reference_audio or not os.path.exists(reference_audio): + plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + default_audio = os.path.join(plugin_dir, "test.wav") + if os.path.exists(default_audio): + reference_audio = default_audio + logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}") + + # 如果没有 prompt_text,使用默认文本 + if not prompt_text: + prompt_text = "大家好,我是嘉然,今天我来为大家朗读。" + logger.debug(f"{self.log_prefix} 使用默认 prompt_text") + + # voice 参数可以覆盖配置文件中的参考音频 + if voice and os.path.exists(voice): + reference_audio = voice + + # 解析指令文本 + instruct_text = self._resolve_instruct(emotion) + + logger.info( + f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' 
" + f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}" + ) + + try: + # 动态导入 gradio_client(避免全局依赖) + try: + from gradio_client import Client, handle_file + except ImportError: + logger.error(f"{self.log_prefix} gradio_client 未安装,请运行: pip install gradio_client") + return TTSResult( + False, + "gradio_client 未安装,请运行: pip install gradio_client", + backend_name=self.backend_name + ) + + # 创建 Gradio 客户端(设置超时) + try: + import httpx + httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)} + client = Client(gradio_url, httpx_kwargs=httpx_kwargs) + except Exception as e: + logger.warning(f"{self.log_prefix} 无法设置 httpx 超时,使用默认配置: {e}") + client = Client(gradio_url) + + # 准备参数 + logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}") + prompt_wav_upload = handle_file(reference_audio) if reference_audio and os.path.exists(reference_audio) else None + logger.debug(f"{self.log_prefix} 参考音频准备完成") + + # 调用 API + logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)") + logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})") + logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...") + logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...") + + result = await asyncio.wait_for( + asyncio.to_thread( + client.predict, + tts_text=text, + mode_checkbox_group=mode_str, + prompt_text=prompt_text, + prompt_wav_upload=prompt_wav_upload, + prompt_wav_record=None, + instruct_text=instruct_text, + seed=0, + stream=False, # API 实际期望布尔值 False,虽然文档显示为 Literal['False'] + api_name="/generate_audio" + ), + timeout=timeout + ) + + logger.info(f"{self.log_prefix} CosyVoice API 响应成功") + + # result 是生成的音频文件路径 + if not result or not os.path.exists(result): + return TTSResult( + False, + f"CosyVoice 生成失败,未返回有效文件: {result}", + backend_name=self.backend_name + ) + + # 读取音频数据 + try: + with open(result, 'rb') as f: + audio_data = f.read() + except Exception as e: + logger.error(f"{self.log_prefix} 读取音频文件失败: {e}") + return TTSResult( + False, + f"读取音频文件失败: {e}", + backend_name=self.backend_name + ) + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}") + return TTSResult( + False, + f"CosyVoice语音{error_msg}", + backend_name=self.backend_name + ) + + logger.debug( + f"{self.log_prefix} CosyVoice音频数据验证通过 " + f"(大小: {len(audio_data)}字节)" + ) + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav") + voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}" + + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_cosyvoice", + voice_info=voice_info + ) + + except asyncio.TimeoutError: + logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)") + return TTSResult( + False, + "CosyVoice API 调用超时", + backend_name=self.backend_name + ) + except Exception as e: + logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}") + return TTSResult( + False, + f"CosyVoice 执行错误: {e}", + backend_name=self.backend_name + ) diff --git a/backends/doubao.py b/backends/doubao.py new file mode 100644 index 00000000..4c566abf --- /dev/null +++ b/backends/doubao.py @@ -0,0 +1,230 @@ +""" +豆包语音后端实现 +使用字节跳动豆包语音 API 进行语音合成 +""" + +import asyncio +import uuid +from typing import Optional, List, Dict, Tuple +from .base import TTSBackendBase, TTSResult +from .doubao_stream_parser import DoubaoStreamParser +from ..utils.file 
import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_doubao") + +# 豆包语音情感映射表(用于自动生成context_texts) +DOUBAO_EMOTION_MAP = { + # 积极情绪 + "开心": "你的语气再欢乐一点", + "兴奋": "用特别兴奋激动的语气说话", + "温柔": "用温柔体贴的语气说话", + "骄傲": "用骄傲的语气说话", + "自信": "用自信坚定的语气说话", + + # 消极情绪 + "生气": "你得跟我互怼!就是跟我用吵架的语气对话", + "愤怒": "用愤怒的语气说话", + "伤心": "用特别特别痛心的语气说话", + "失望": "用失望沮丧的语气说话", + "委屈": "用委屈的语气说话", + + # 中性情绪 + "平静": "用平静淡定的语气说话", + "严肃": "用严肃认真的语气说话", + "疑惑": "用疑惑不解的语气说话", + + # 语速调整 + "慢速": "说慢一点", + "快速": "说快一点", + + # 音量调整 + "小声": "你嗓门再小点", + "大声": "大声一点", +} + + +class DoubaoBackend(TTSBackendBase): + """ + 豆包语音后端 + + 使用字节跳动豆包语音 API 进行高质量语音合成 + 支持预置音色和复刻音色 + """ + + backend_name = "doubao" + backend_description = "字节跳动豆包语音API" + support_private_chat = True + default_audio_format = "mp3" + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.DOUBAO_DEFAULT_VOICE, "zh_female_shuangkuaisisi_moon_bigtts") + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "") + access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "") + resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "") + + if not app_id or not access_key or not resource_id: + return False, "豆包语音后端缺少必需的认证配置(app_id/access_key/resource_id)" + + return True, "" + + def _resolve_emotion(self, emotion: Optional[str]) -> Optional[List[str]]: + """ + 解析情感参数为 context_texts + + Args: + emotion: 情感关键词 + + Returns: + context_texts 列表或 None + """ + if emotion and emotion in DOUBAO_EMOTION_MAP: + return [DOUBAO_EMOTION_MAP[emotion]] + return None + + async def execute( + self, + text: str, + voice: Optional[str] = None, + emotion: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行豆包语音合成 + + Args: + text: 待转换的文本 + voice: 音色ID + emotion: 情感/语气参数 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + api_url = self.get_config(ConfigKeys.DOUBAO_API_URL, "https://openspeech.bytedance.com/api/v3/tts/unidirectional") + app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "") + access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "") + resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "") + timeout = self.get_config(ConfigKeys.DOUBAO_TIMEOUT, 30) + + if not voice: + voice = self.get_default_voice() + + # 构建请求头 + headers = { + "Content-Type": "application/json", + "X-Api-App-Id": app_id, + "X-Api-Access-Key": access_key, + "X-Api-Resource-Id": resource_id, + "X-Api-Request-Id": str(uuid.uuid4()), + "Accept-Encoding": "gzip, deflate" + } + + # 构建请求体 + request_data: Dict[str, any] = { + "req_params": { + "text": text, + "speaker": voice, + "audio_params": { + "format": self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3"), + "sample_rate": self.get_config(ConfigKeys.DOUBAO_SAMPLE_RATE, 24000), + "bitrate": self.get_config(ConfigKeys.DOUBAO_BITRATE, 128000) + } + } + } + + # 添加可选参数 + speed = self.get_config(ConfigKeys.DOUBAO_SPEED, None) + if speed is not None: + request_data["req_params"]["speed"] = speed + + volume = self.get_config(ConfigKeys.DOUBAO_VOLUME, None) + if volume is not None: + request_data["req_params"]["volume"] = volume + + # 处理 context_texts + context_texts: 
Optional[List[str]] = None + + # 优先使用传入的emotion参数 + if emotion: + context_texts = self._resolve_emotion(emotion) + if context_texts: + logger.info(f"{self.log_prefix} 使用emotion参数: {emotion} -> {context_texts[0]}") + + # 否则使用配置文件的默认值 + if not context_texts: + context_texts = self.get_config(ConfigKeys.DOUBAO_CONTEXT_TEXTS, None) + + if context_texts: + request_data["req_params"]["context_texts"] = context_texts + + logger.info(f"{self.log_prefix} 豆包语音请求: text='{text[:50]}...' (共{len(text)}字符), voice={voice}") + + try: + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + api_url, + json=request_data, + headers=headers, + backend_name="doubao", + timeout=timeout + ) as response: + logger.info(f"{self.log_prefix} 豆包API响应状态码: {response.status}") + + if response.status == 200: + # 使用新的流式响应解析器 + audio_data, error_msg = await DoubaoStreamParser.parse_response( + response, + log_prefix=self.log_prefix + ) + + if error_msg: + logger.error(f"{self.log_prefix} 豆包语音解析失败: {error_msg}") + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + logger.warning(f"{self.log_prefix} 豆包音频数据验证失败: {error_msg}") + return TTSResult(False, f"豆包语音{error_msg}", backend_name=self.backend_name) + + logger.debug(f"{self.log_prefix} 豆包音频数据验证通过 (大小: {len(audio_data)}字节)") + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3") + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_doubao", + voice_info=f"音色: {voice}" + ) + else: + error_text = await response.text() + logger.error(f"{self.log_prefix} 豆包API请求失败[{response.status}]: {error_text[:200]}") + return TTSResult( + False, + f"豆包语音API调用失败: {response.status} - {error_text[:100]}", + backend_name=self.backend_name + ) + + except asyncio.TimeoutError: + logger.error(f"{self.log_prefix} 豆包API请求超时 (配置超时: {timeout}秒)") + return TTSResult(False, "豆包语音API调用超时", backend_name=self.backend_name) + except Exception as e: + logger.error(f"{self.log_prefix} 豆包语音执行异常: {e}") + return TTSResult(False, f"豆包语音执行错误: {e}", backend_name=self.backend_name) diff --git a/backends/doubao_stream_parser.py b/backends/doubao_stream_parser.py new file mode 100644 index 00000000..a3f61925 --- /dev/null +++ b/backends/doubao_stream_parser.py @@ -0,0 +1,432 @@ +""" +豆包语音流式响应解析器 +基于官方示例实现,确保兼容性和正确性 + +官方API说明: +- code=0: 继续处理,可能包含 "data"(音频)或 "sentence"(文本) +- code=20000000: 结束标志,可能包含 "usage"(用量统计) +- code>0: 错误响应 +""" + +import json +import base64 +from typing import Tuple, Optional, List +from src.common.logger import get_logger + +logger = get_logger("doubao_stream_parser") + + +class DoubaoStreamParser: + """ + 豆包语音流式响应解析器 + + 基于官方API实现,忠实还原官方示例逻辑。 + 处理流程: + 1. 逐行读取 JSON 响应 + 2. 检查状态码:code=0(继续), code=20000000(结束), code>0(错误) + 3. 提取音频数据(code=0 且有 "data" 字段) + 4. 
记录日志(code=0 且有 "sentence" 字段) + """ + + def __init__(self, log_prefix: str = "[DoubaoParser]"): + """ + 初始化解析器 + + Args: + log_prefix: 日志前缀 + """ + self.log_prefix = log_prefix + self._audio_chunks: List[bytes] = [] + self._buffer: bytes = b'' + self._line_count: int = 0 + self._total_bytes: int = 0 + self._error_message: Optional[str] = None + self._finished: bool = False # 是否收到结束信号 + self._usage_info: Optional[dict] = None + + def _decode_audio_from_base64(self, audio_base64: str) -> Optional[bytes]: + """ + 从 Base64 字符串解码音频数据 + + 官方示例中直接使用 base64.b64decode(data["data"]), + 但我们添加了额外的容错和验证。 + + Args: + audio_base64: Base64 编码的音频数据 + + Returns: + 解码后的音频字节数据或 None + """ + if not audio_base64: + return None + + try: + # 官方示例直接调用 base64.b64decode() + # 这里添加容错处理:补充填充符(如果需要) + padding_needed = len(audio_base64) % 4 + if padding_needed: + audio_base64 += '=' * (4 - padding_needed) + logger.debug( + f"{self.log_prefix} Base64填充已应用 " + f"(原长: {len(audio_base64) - (4 - padding_needed)}, 新长: {len(audio_base64)})" + ) + + audio_bytes = base64.b64decode(audio_base64) + + if not audio_bytes: + logger.warning(f"{self.log_prefix} Base64解码结果为空") + return None + + logger.debug( + f"{self.log_prefix} 音频块解码成功 - 大小: {len(audio_bytes)}字节" + ) + return audio_bytes + + except Exception as e: + logger.error( + f"{self.log_prefix} Base64解码失败: {e} " + f"(Base64长度: {len(audio_base64)})" + ) + return None + + def _process_json_line(self, line_str: str) -> Optional[str]: + """ + 处理单行 JSON 数据 + + 严格按照官方示例逻辑: + 1. 检查 code 字段 + 2. code=0 且有 data → 提取音频 + 3. code=0 且有 sentence → 记录文本(可选) + 4. code=20000000 → 收到结束信号 + 5. code>0 → 错误 + + Args: + line_str: JSON 字符串 + + Returns: + 如果收到结束信号,返回 "END";如果发生错误,返回错误信息;否则返回 None + """ + try: + json_obj = json.loads(line_str) + except json.JSONDecodeError as e: + logger.debug(f"{self.log_prefix} JSON解析失败: {e}") + return None + except Exception as e: + logger.warning(f"{self.log_prefix} JSON处理异常: {e}") + return None + + if not isinstance(json_obj, dict): + logger.debug( + f"{self.log_prefix} 收到非字典JSON对象: {type(json_obj).__name__}" + ) + return None + + code = json_obj.get("code", -1) + + # ✅ 官方逻辑:处理 code=0 的数据帧 + if code == 0: + # 检查是否有音频数据 + if "data" in json_obj and json_obj["data"]: + chunk_audio = self._decode_audio_from_base64(json_obj["data"]) + if chunk_audio: + self._audio_chunks.append(chunk_audio) + logger.debug( + f"{self.log_prefix} 音频块#{len(self._audio_chunks)} 已接收 " + f"(大小: {len(chunk_audio)}字节)" + ) + + # 检查是否有文本/句子信息(可选) + if "sentence" in json_obj and json_obj["sentence"]: + sentence_data = json_obj.get("sentence", {}) + logger.debug( + f"{self.log_prefix} 收到句子数据: {sentence_data}" + ) + + return None # 继续处理 + + # ✅ 官方逻辑:处理 code=20000000 的结束帧 + elif code == 20000000: + logger.info(f"{self.log_prefix} 收到流结束信号 (code=20000000)") + + # 记录用量信息(如果有) + if "usage" in json_obj: + self._usage_info = json_obj["usage"] + logger.info( + f"{self.log_prefix} 豆包用量信息: {self._usage_info}" + ) + + self._finished = True + return "END" # 表示流已结束 + + # ✅ 官方逻辑:错误处理 + elif code and code > 0: + error_msg = json_obj.get("message", f"未知错误 (code={code})") + logger.error( + f"{self.log_prefix} 豆包语音API返回错误 " + f"(code={code}): {error_msg}" + ) + self._error_message = error_msg + return error_msg # 返回错误信息 + + # 未知状态码 + else: + logger.debug( + f"{self.log_prefix} 收到未知状态码: code={code}" + ) + return None + + def _find_data_chunk_offset(self, header: bytes) -> int: + """ + 在 WAV header 中查找 'data' 块的位置 + + 豆包返回的 WAV 可能包含额外的元数据块(如 LIST/INFO), + 导致 'data' 块不在标准的 44 字节位置。 + + Args: + header: WAV 文件头部数据 + + 
Returns: + data 块数据开始的位置(即 'data' + 4字节大小之后) + """ + pos = 12 # 跳过 RIFF(4) + size(4) + WAVE(4) + + while pos < len(header) - 8: + chunk_id = header[pos:pos+4] + chunk_size = int.from_bytes(header[pos+4:pos+8], 'little') + + if chunk_id == b'data': + return pos + 8 # 返回音频数据开始位置 + + # 移动到下一个块 + pos += 8 + chunk_size + # WAV 块需要对齐到偶数字节 + if chunk_size % 2 == 1: + pos += 1 + + # 未找到 data 块,返回默认值 + return 44 + + def _merge_audio_chunks(self, chunks: List[bytes]) -> bytes: + """ + 合并音频块,处理 WAV 格式的流式响应 + + 豆包流式 WAV 响应特点: + 1. 第一个块包含完整 header(可能 > 44 字节,含 LIST/INFO 元数据) + 2. header 中的大小字段是 0xFFFFFFFF(流式占位符) + 3. 后续块是纯音频数据(无 header) + 4. 需要在合并后修正大小字段 + + Args: + chunks: 音频数据块列表 + + Returns: + 合并后的有效 WAV 文件 + """ + if not chunks: + return b'' + + first_chunk = chunks[0] + + # 检查是否是 WAV 格式(RIFF header) + if len(first_chunk) < 44 or first_chunk[:4] != b'RIFF': + # 不是 WAV 格式(如 MP3),直接拼接 + return b''.join(chunks) + + # 查找 data 块的实际位置 + data_offset = self._find_data_chunk_offset(first_chunk) + logger.debug(f"{self.log_prefix} WAV data 块偏移: {data_offset} 字节") + + # 提取 header 和第一块的音频数据 + header = bytearray(first_chunk[:data_offset]) + data_parts = [first_chunk[data_offset:]] + skipped_headers = 0 + + # 处理后续块 + for chunk in chunks[1:]: + if len(chunk) > 44 and chunk[:4] == b'RIFF': + # 后续块也有 RIFF header,需要跳过 + chunk_data_offset = self._find_data_chunk_offset(chunk) + data_parts.append(chunk[chunk_data_offset:]) + skipped_headers += 1 + else: + # 纯音频数据 + data_parts.append(chunk) + + # 合并所有音频数据 + audio_data = b''.join(data_parts) + audio_size = len(audio_data) + + # 修正 WAV header 中的大小字段 + # 字节 4-7: 文件总大小 - 8 = (header_size - 8) + audio_size + file_size = len(header) - 8 + audio_size + header[4:8] = file_size.to_bytes(4, 'little') + + # 修正 data 块的大小字段(位于 data_offset - 4 处) + header[data_offset-4:data_offset] = audio_size.to_bytes(4, 'little') + + if skipped_headers > 0 or audio_size > 0: + logger.info( + f"{self.log_prefix} WAV 流式合并完成: " + f"header={len(header)}字节, 音频={audio_size}字节, " + f"跳过重复header={skipped_headers}" + ) + + return bytes(header) + audio_data + + def feed_chunk(self, chunk: bytes) -> Optional[str]: + """ + 输入一块数据 + + Args: + chunk: 网络数据块 + + Returns: + 如果遇到错误或结束,返回相应信息;否则返回 None + """ + if not chunk: + return None + + self._buffer += chunk + self._total_bytes += len(chunk) + + # 按行处理(官方示例使用 iter_lines) + while b'\n' in self._buffer: + line_bytes, self._buffer = self._buffer.split(b'\n', 1) + + # 尝试解码行数据 + try: + line_str = line_bytes.decode('utf-8', errors='replace').strip() + except Exception as e: + logger.warning( + f"{self.log_prefix} 行解码失败: {e}, 跳过该行" + ) + self._line_count += 1 + continue + + if not line_str: + continue + + self._line_count += 1 + + # 处理该行 + result = self._process_json_line(line_str) + + # 如果收到结束信号或错误,立即返回 + if result == "END": + return None # 正常结束 + elif result: # 返回的是错误信息 + return result + + return None + + def finalize(self) -> Tuple[Optional[bytes], Optional[str]]: + """ + 完成解析,处理剩余数据 + + Returns: + (audio_data, error_message) + - audio_data: 合并后的音频数据(成功时) + - error_message: 错误信息(失败时) + """ + # 处理剩余的 buffer 中的最后一行 + if self._buffer.strip(): + try: + line_str = self._buffer.decode('utf-8', errors='replace').strip() + if line_str: + logger.debug( + f"{self.log_prefix} 处理最后的buffer数据 " + f"(长度: {len(line_str)}字符)" + ) + result = self._process_json_line(line_str) + if result and result != "END": + # 最后的 buffer 包含错误 + self._error_message = result + except Exception as e: + logger.warning( + f"{self.log_prefix} 最后buffer解析异常: {e}" + ) + + logger.info( + f"{self.log_prefix} 
豆包流解析完成 - " + f"处理行数: {self._line_count}, " + f"音频块数: {len(self._audio_chunks)}, " + f"接收字节数: {self._total_bytes}, " + f"正常结束: {self._finished}" + ) + + # 检查是否有错误 + if self._error_message: + logger.error( + f"{self.log_prefix} 豆包API返回错误: {self._error_message}" + ) + return None, f"豆包语音API错误: {self._error_message}" + + # 检查是否有音频数据 + if not self._audio_chunks: + if self._total_bytes == 0: + logger.warning( + f"{self.log_prefix} 豆包API未返回任何数据" + ) + return None, "未收到任何响应数据" + + logger.warning( + f"{self.log_prefix} 收到 {self._total_bytes} 字节数据但无音频块" + ) + return None, "豆包语音未返回任何音频数据" + + # ✅ 额外的数据完整性检查 + # 过滤掉过小的块(可能是损坏或无效的) + min_chunk_size = 50 # 最小块大小 + valid_chunks = [ + chunk for chunk in self._audio_chunks + if len(chunk) >= min_chunk_size + ] + + if not valid_chunks: + logger.error( + f"{self.log_prefix} 所有音频块都太小 (可能是损坏的数据)" + ) + logger.debug( + f"{self.log_prefix} 块大小分布: {[len(c) for c in self._audio_chunks]}" + ) + return None, "音频数据不完整或已损坏" + + # 合并所有有效的音频数据(处理 WAV 多 header 问题) + merged_audio = self._merge_audio_chunks(valid_chunks) + + logger.info( + f"{self.log_prefix} 音频合并完成 - " + f"有效块数: {len(valid_chunks)}/{len(self._audio_chunks)}, " + f"总大小: {len(merged_audio)}字节" + ) + + return merged_audio, None + + @classmethod + async def parse_response( + cls, + response, + log_prefix: str = "[DoubaoParser]" + ) -> Tuple[Optional[bytes], Optional[str]]: + """ + 解析豆包 API 的流式响应 + + Args: + response: aiohttp 响应对象 + log_prefix: 日志前缀 + + Returns: + (audio_data, error_message) + """ + parser = cls(log_prefix) + + # 逐块读取响应流 + async for chunk in response.content.iter_any(): + result = parser.feed_chunk(chunk) + + # 如果遇到错误,立即返回 + if result and result != "END": + return None, result + + # 完成解析,处理剩余数据 + return parser.finalize() diff --git a/backends/gpt_sovits.py b/backends/gpt_sovits.py new file mode 100644 index 00000000..126851ff --- /dev/null +++ b/backends/gpt_sovits.py @@ -0,0 +1,326 @@ +""" +GPT-SoVITS 后端实现 +使用本地 GPT-SoVITS 服务进行语音合成 +""" + +import asyncio +from typing import Optional, Dict, Any, Tuple, ClassVar +from .base import TTSBackendBase, TTSResult +from ..utils.text import TTSTextUtils +from ..utils.file import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_gpt_sovits") + + +class GPTSoVITSBackend(TTSBackendBase): + """ + GPT-SoVITS 后端 + + 使用本地 GPT-SoVITS 服务进行高度定制化的语音合成 + 支持动态切换 GPT 和 SoVITS 模型权重 + """ + + backend_name = "gpt_sovits" + backend_description = "本地GPT-SoVITS服务" + support_private_chat = True + default_audio_format = "mp3" + + # 类变量:记录当前加载的模型路径,避免重复切换 + _current_gpt_weights: ClassVar[Optional[str]] = None + _current_sovits_weights: ClassVar[Optional[str]] = None + + def get_default_voice(self) -> str: + """获取默认风格""" + return "default" + + async def _switch_model( + self, + server: str, + gpt_weights: Optional[str], + sovits_weights: Optional[str], + timeout: int + ) -> Tuple[bool, str]: + """ + 切换 GPT-SoVITS 模型权重 + + Args: + server: 服务器地址 + gpt_weights: GPT 模型权重路径 + sovits_weights: SoVITS 模型权重路径 + timeout: 超时时间 + + Returns: + (success, error_message) + """ + session_manager = await TTSSessionManager.get_instance() + + async def _set_model_v1() -> Tuple[bool, str]: + # 兼容旧版 api.py: 仅支持 /set_model 同时切换 + if not gpt_weights or not sovits_weights: + return False, "当前GPT-SoVITS服务不支持单独切换模型(请同时配置GPT与SoVITS权重)" + set_model_url = ( + f"{server.rstrip('/')}/set_model?" 
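+                # Note: the weight paths are interpolated without URL-encoding;
+                # if a path could contain '&' or spaces it would need quoting
+                # (e.g. urllib.parse.quote); plain ASCII paths are assumed here.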
+ f"gpt_model_path={gpt_weights}&sovits_model_path={sovits_weights}" + ) + logger.info(f"{self.log_prefix} 切换模型(兼容模式): {gpt_weights} | {sovits_weights}") + try: + async with session_manager.get( + set_model_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_gpt_weights = gpt_weights + GPTSoVITSBackend._current_sovits_weights = sovits_weights + logger.info(f"{self.log_prefix} 模型切换成功(兼容模式)") + return True, "" + error_text = await response.text() + return False, f"模型切换失败: {error_text}" + except Exception as e: + return False, f"模型切换异常: {e}" + + # 切换 GPT 权重 + if gpt_weights and gpt_weights != GPTSoVITSBackend._current_gpt_weights: + gpt_url = f"{server.rstrip('/')}/set_gpt_weights?weights_path={gpt_weights}" + logger.info(f"{self.log_prefix} 切换GPT模型: {gpt_weights}") + + try: + async with session_manager.get( + gpt_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_gpt_weights = gpt_weights + logger.info(f"{self.log_prefix} GPT模型切换成功") + elif response.status == 404: + # 旧版服务没有 /set_gpt_weights + return await _set_model_v1() + else: + error_text = await response.text() + return False, f"GPT模型切换失败: {error_text}" + except Exception as e: + return False, f"GPT模型切换异常: {e}" + + # 切换 SoVITS 权重 + if sovits_weights and sovits_weights != GPTSoVITSBackend._current_sovits_weights: + sovits_url = f"{server.rstrip('/')}/set_sovits_weights?weights_path={sovits_weights}" + logger.info(f"{self.log_prefix} 切换SoVITS模型: {sovits_weights}") + + try: + async with session_manager.get( + sovits_url, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + GPTSoVITSBackend._current_sovits_weights = sovits_weights + logger.info(f"{self.log_prefix} SoVITS模型切换成功") + elif response.status == 404: + # 旧版服务没有 /set_sovits_weights + return await _set_model_v1() + else: + error_text = await response.text() + return False, f"SoVITS模型切换失败: {error_text}" + except Exception as e: + return False, f"SoVITS模型切换异常: {e}" + + return True, "" + + def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]: + """ + 规范化风格配置格式 + + 支持两种格式: + 1. 旧格式(字典): {"default": {...}, "happy": {...}} + 2. 
新格式(数组): [{"name": "default", ...}, {"name": "happy", ...}] + + 统一转换为字典格式供内部使用 + """ + # 如果是字典格式(旧格式),直接返回 + if isinstance(styles_config, dict): + return styles_config + + # 如果是数组格式(新格式),转换为字典 + if isinstance(styles_config, list): + result = {} + for style in styles_config: + if isinstance(style, dict) and "name" in style: + style_name = style["name"] + # 复制配置,移除 name 字段 + style_data = {k: v for k, v in style.items() if k != "name"} + result[style_name] = style_data + return result + + # 其他情况返回空字典 + return {} + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {}) + styles = self._normalize_styles_config(styles_raw) + + if not styles or "default" not in styles: + return False, "GPT-SoVITS未配置任何语音风格" + + default_style = styles.get("default", {}) + if not default_style.get("refer_wav") or not default_style.get("prompt_text"): + return False, "GPT-SoVITS默认风格配置不完整(需要refer_wav和prompt_text)" + + return True, "" + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行GPT-SoVITS语音合成 + + Args: + text: 待转换的文本 + voice: 风格名称 + + Returns: + TTSResult + """ + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + server = self.get_config(ConfigKeys.GPT_SOVITS_SERVER, "http://127.0.0.1:9880") + styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {}) + styles = self._normalize_styles_config(styles_raw) + timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60) + + # 确定使用的风格 + voice_style = voice if voice and voice in styles else "default" + + if voice_style not in styles: + return TTSResult( + False, + f"GPT-SoVITS风格 '{voice_style}' 未配置", + backend_name=self.backend_name + ) + + style_config = styles[voice_style] + refer_wav_path = style_config.get("refer_wav", "") + prompt_text = style_config.get("prompt_text", "") + prompt_language = style_config.get("prompt_language", "zh") + gpt_weights = style_config.get("gpt_weights") + sovits_weights = style_config.get("sovits_weights") + + if not refer_wav_path or not prompt_text: + return TTSResult( + False, + f"GPT-SoVITS风格 '{voice_style}' 配置不完整", + backend_name=self.backend_name + ) + + # 如果配置了模型权重,先切换模型 + if gpt_weights or sovits_weights: + switch_success, switch_error = await self._switch_model( + server, gpt_weights, sovits_weights, timeout + ) + if not switch_success: + return TTSResult(False, switch_error, backend_name=self.backend_name) + + # 检测文本语言 + text_language = TTSTextUtils.detect_language(text) + + # 构建请求数据 + data = { + "text": text, + "text_lang": text_language, + "ref_audio_path": refer_wav_path, + "prompt_text": prompt_text, + "prompt_lang": prompt_language + } + + tts_url = f"{server.rstrip('/')}/tts" + legacy_tts_url = f"{server.rstrip('/')}/" + legacy_data = { + "text": text, + "text_language": text_language, + "refer_wav_path": refer_wav_path, + "prompt_text": prompt_text, + "prompt_language": prompt_language, + } + + logger.info(f"{self.log_prefix} GPT-SoVITS请求: text='{text[:50]}...', style={voice_style}") + + try: + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + tts_url, + json=data, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + audio_data = await response.read() + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return TTSResult(False, f"GPT-SoVITS{error_msg}", 
backend_name=self.backend_name) + + # 使用统一的发送方法 + return await self.send_audio( + audio_data=audio_data, + audio_format="wav", + prefix="tts_gpt_sovits", + voice_info=f"风格: {voice_style}" + ) + elif response.status == 404: + # 兼容旧版 api.py:没有 /tts 端点,回退到根路径 + logger.warning(f"{self.log_prefix} /tts 端点不存在,尝试兼容模式请求根路径") + else: + error_info = await response.text() + logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}") + return TTSResult( + False, + f"GPT-SoVITS API调用失败: {response.status}", + backend_name=self.backend_name + ) + + # 仅在 /tts 404 时回退到旧版根路径 + async with session_manager.post( + legacy_tts_url, + json=legacy_data, + backend_name="gpt_sovits", + timeout=timeout + ) as response: + if response.status == 200: + audio_data = await response.read() + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name) + + return await self.send_audio( + audio_data=audio_data, + audio_format="wav", + prefix="tts_gpt_sovits", + voice_info=f"风格: {voice_style}" + ) + else: + error_info = await response.text() + logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}") + return TTSResult( + False, + f"GPT-SoVITS API调用失败: {response.status}", + backend_name=self.backend_name + ) + + except asyncio.TimeoutError: + return TTSResult(False, "GPT-SoVITS API调用超时", backend_name=self.backend_name) + except Exception as e: + logger.error(f"{self.log_prefix} GPT-SoVITS执行错误: {e}") + return TTSResult(False, f"GPT-SoVITS执行错误: {e}", backend_name=self.backend_name) diff --git a/backends/gsv2p.py b/backends/gsv2p.py new file mode 100644 index 00000000..8837d881 --- /dev/null +++ b/backends/gsv2p.py @@ -0,0 +1,186 @@ +""" +GSV2P 后端实现 +使用 GSV2P 云端 API 进行语音合成 +""" + +import asyncio +import json +from typing import Optional, Dict, Any, Tuple +from .base import TTSBackendBase, TTSResult +from ..utils.file import TTSFileManager +from ..utils.session import TTSSessionManager +from ..config_keys import ConfigKeys +from src.common.logger import get_logger + +logger = get_logger("tts_gsv2p") + +# 重试配置 +MAX_RETRIES = 5 # 最大重试次数 +RETRY_DELAY = 3.0 # 重试间隔(秒) + + +class GSV2PBackend(TTSBackendBase): + """ + GSV2P 后端 + + 使用 GSV2P 云端 API 进行高质量语音合成 + """ + + backend_name = "gsv2p" + backend_description = "GSV2P云端API语音合成" + support_private_chat = True + default_audio_format = "mp3" + + def get_default_voice(self) -> str: + """获取默认音色""" + return self.get_config(ConfigKeys.GSV2P_DEFAULT_VOICE, "原神-中文-派蒙_ZH") + + def validate_config(self) -> Tuple[bool, str]: + """验证配置""" + api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "") + if not api_token: + return False, "GSV2P后端缺少API Token配置" + return True, "" + + async def _make_request( + self, + api_url: str, + request_data: Dict[str, Any], + headers: Dict[str, str], + timeout: int + ) -> Tuple[bool, Any, str]: + """ + 发送单次API请求 + + Returns: + (成功标志, 音频数据或None, 错误信息) + """ + session_manager = await TTSSessionManager.get_instance() + async with session_manager.post( + api_url, + json=request_data, + headers=headers, + backend_name="gsv2p", + timeout=timeout + ) as response: + if response.status == 200: + content_type = response.headers.get('Content-Type', '') + audio_data = await response.read() + + # 检查是否返回了JSON错误(服务端不稳定时会返回参数错误) + if 'application/json' in content_type: + try: + error_json = json.loads(audio_data.decode('utf-8')) + error_msg = error_json.get('error', {}).get('message', 
str(error_json)) + # 参数错误通常是服务端临时问题,可以重试 + return False, None, f"API返回错误: {error_msg}" + except Exception: + return False, None, "API返回异常响应" + + # 验证音频数据 + is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data) + if not is_valid: + return False, None, f"音频数据无效: {error_msg}" + + return True, audio_data, "" + else: + error_text = await response.text() + return False, None, f"API调用失败: {response.status} - {error_text[:100]}" + + async def execute( + self, + text: str, + voice: Optional[str] = None, + **kwargs + ) -> TTSResult: + """ + 执行GSV2P语音合成(带重试机制) + + Args: + text: 待转换的文本 + voice: 音色名称 + + Returns: + TTSResult + """ + # 验证配置 + is_valid, error_msg = self.validate_config() + if not is_valid: + return TTSResult(False, error_msg, backend_name=self.backend_name) + + # 验证文本 + if not text or not text.strip(): + return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name) + + # 获取配置 + api_url = self.get_config(ConfigKeys.GSV2P_API_URL, "https://gsv2p.acgnai.top/v1/audio/speech") + api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "") + timeout = self.get_config(ConfigKeys.GSV2P_TIMEOUT, 30) + + if not voice: + voice = self.get_default_voice() + + # 构建请求参数(注意:other_params 已被 API 废弃,不再支持) + request_data: Dict[str, Any] = { + "model": self.get_config(ConfigKeys.GSV2P_MODEL, "tts-v4"), + "input": text, + "voice": voice, + "response_format": self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3"), + "speed": self.get_config(ConfigKeys.GSV2P_SPEED, 1) + } + + headers = { + "accept": "application/json", + "Authorization": f"Bearer {api_token}", + "Content-Type": "application/json" + } + + logger.info(f"{self.log_prefix} GSV2P请求: text='{text[:50]}...', voice={voice}") + logger.debug(f"{self.log_prefix} GSV2P完整请求参数: {json.dumps(request_data, ensure_ascii=False, indent=2)}") + + last_error = "" + for attempt in range(1, MAX_RETRIES + 1): + try: + success, audio_data, error_msg = await self._make_request( + api_url, request_data, headers, timeout + ) + + if success and audio_data: + if attempt > 1: + logger.info(f"{self.log_prefix} GSV2P第{attempt}次重试成功") + + logger.info(f"{self.log_prefix} GSV2P响应: 数据大小={len(audio_data)}字节") + + # 使用统一的发送方法 + audio_format = self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3") + return await self.send_audio( + audio_data=audio_data, + audio_format=audio_format, + prefix="tts_gsv2p", + voice_info=f"音色: {voice}" + ) + else: + last_error = error_msg + if attempt < MAX_RETRIES: + logger.warning(f"{self.log_prefix} GSV2P请求失败 ({error_msg}), {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})") + await asyncio.sleep(RETRY_DELAY) + else: + logger.error(f"{self.log_prefix} GSV2P请求失败,已达最大重试次数: {error_msg}") + + except asyncio.TimeoutError: + last_error = "API调用超时" + if attempt < MAX_RETRIES: + logger.warning(f"{self.log_prefix} GSV2P超时, {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})") + await asyncio.sleep(RETRY_DELAY) + else: + logger.error(f"{self.log_prefix} GSV2P超时,已达最大重试次数") + + except Exception as e: + last_error = str(e) + logger.error(f"{self.log_prefix} GSV2P执行错误: {e}") + if attempt < MAX_RETRIES: + await asyncio.sleep(RETRY_DELAY) + else: + break + + return TTSResult(False, f"GSV2P {last_error} (已重试{MAX_RETRIES}次)", backend_name=self.backend_name) diff --git a/config.toml b/config.toml new file mode 100644 index 00000000..9c045560 --- /dev/null +++ b/config.toml @@ -0,0 +1,292 @@ +# tts_voice_plugin - 自动生成的配置文件 +# 统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice五种后端引擎,提供灵活的语音合成能力。 + +# 插件基本配置 +[plugin] + +# 是否启用插件 +enabled = true + +# 配置文件版本 
+config_version = "3.2.3" + +# 通用设置 + +[general] + +# 默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui) +# 可选: ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice +default_backend = "comfyui_customvoice" + +# 请求超时时间(秒) +timeout = 60 + +# 最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断) +max_text_length = 200 + +# 是否使用replyer润色语音内容 +use_replyer_rewrite = true + +# 音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录) +audio_output_dir = "" + +# 是否使用base64编码发送音频(备选方案) +use_base64_audio = true + +# 是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题) +split_sentences = true + +# 分段发送时每条语音之间的延迟(秒) +split_delay = 0.3 + +# 自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段) +split_min_total_chars = 120 + +# 句子最小长度:过短片段会合并到前一句(用于减少碎片段) +split_min_sentence_chars = 6 + +# 自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。 +split_max_segments = 3 + +# 自动分段打包目标长度(字符)。用于把多句合并成更少段。 +split_chunk_chars = 110 + +# 是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户) +send_error_messages = true + +# 组件启用控制 + +[components] + +# 是否启用Action组件 +action_enabled = true + +# 是否启用Command组件 +command_enabled = true + +# 是否启用 instruct 调试命令组件(/tts_instruct) +instruct_command_enabled = true + +# 概率控制配置 + +[probability] + +# 是否启用概率控制 +enabled = true + +# 基础触发概率 +base_probability = 1 + +# 关键词强制触发 +keyword_force_trigger = true + +# 强制触发关键词 +force_keywords = [ + "一定要用语音", + "必须语音", + "语音回复我", + "务必用语音", +] + +# AI Voice后端配置 + +[ai_voice] + +# 默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女) +default_character = "邻家小妹" + +# GSV2P后端配置 + +[gsv2p] + +# GSV2P API地址 +api_url = "https://gsv2p.acgnai.top/v1/audio/speech" + +# API认证Token +api_token = "" + +# 默认音色 +default_voice = "原神-中文-派蒙_ZH" + +# API请求超时(秒) +timeout = 149 + +# TTS模型 +model = "tts-v4" + +# 音频格式 +response_format = "wav" + +# 语音速度 +speed = 1 + +# GPT-SoVITS后端配置 + +[gpt_sovits] + +# GPT-SoVITS服务地址 +server = "http://127.0.0.1:9880" + +# 语音风格配置 + +# 豆包语音后端配置 + +[[gpt_sovits.styles]] +name = "default" +refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/s978ztt245c3jxms6apadwgna4e7hmb.mp3" +prompt_text = "私にしてはがんばった方ではないでしょーか?" +prompt_language = "ja" +gpt_weights = "/Users/xenon/Downloads/GPT-SoVITS/GPT_weights_v4/seiun-e15.ckpt" +sovits_weights = "/Users/xenon/Downloads/GPT-SoVITS/SoVITS_weights_v4/seiun_e2_s144_l32.pth" + +[[gpt_sovits.styles]] +name = "" +refer_wav = "" +prompt_text = "" +prompt_language = "zh" +gpt_weights = "" +sovits_weights = "" + +[doubao] + +# 豆包语音API地址 +api_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional" + +# 豆包APP ID +app_id = "" + +# 豆包Access Key +access_key = "" + +# 豆包Resource ID +resource_id = "seed-tts-2.0" + +# 默认音色 +default_voice = "zh_female_vv_uranus_bigtts" + +# API请求超时(秒) +timeout = 60 + +# 音频格式 +audio_format = "wav" + +# 采样率 +sample_rate = 24000 + +# 比特率 +bitrate = 128000 + +# CosyVoice后端配置 + +[cosyvoice] + +# Gradio API地址 +gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/" + +# 推理模式(3s极速复刻/自然语言控制) +default_mode = "3s极速复刻" + +# 默认指令(用于自然语言控制模式) +default_instruct = "You are a helpful assistant. 
请用广东话表达。<|endofprompt|>" + +# 参考音频路径(用于3s极速复刻模式) +reference_audio = "" + +# 提示文本(用于3s极速复刻模式) +prompt_text = "" + +# API请求超时(秒) +timeout = 300 + +# 音频格式 +audio_format = "wav" + +[comfyui] +server = "http://127.0.0.1:8188" +# 必须是 ComfyUI 的 input 目录, backend 会把 refer_wav 复制进去, 再用 LoadAudio 读取 +input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input" +timeout = 120 +audio_quality = "128k" # SaveAudioMP3: V0/128k/320k +mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python" +mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py" +default_style = "default" +# Split comfyui backend into two convenient aliases: +# - comfyui_voiceclone: only uses styles whose mode is voice_clone (or absent) +# - comfyui_customvoice: only uses styles whose mode is custom_voice +# These keys let you pick different defaults without duplicating comfyui.styles. +voiceclone_default_style = "default" +customvoice_default_style = "seiun" +auto_instruct_enabled = true +auto_instruct_max_chars = 320 + +# 自动推断 instruct 时固定附加的“基调”(persona)。会作为 `基调=...;` 前缀插入。 +# 注意:值里不要包含 ';' 或 '='(backend 会做清洗,但建议从源头避免)。 +auto_instruct_base_tone = "女性约15-16岁,清澈透亮但慵懒的轻女高音,句尾元音随意拉长且略带鼻腔撒娇,咬字松弛像刚睡醒,可在慵懒与冷静锐利间切换,带戏谑亲和" + +# 可选:完整基调原文(保留备份,当前不启用) +# auto_instruct_base_tone = """ +# 女性,外表约15-16岁,音色是清澈透亮却带有慵懒感的轻女高音(Light Soprano)。 +# +# 嗓音轻盈飘逸,带有明显的“云朵般”的漂浮感,起初是漫不经心的拖沓语调,其特征在于句尾元音的随意拉长(Drawl)以及略带鼻腔共鸣的撒娇感。咬字呈现出一种仿佛刚睡醒般的松弛,甚至伴有刻意为之的含糊,像是一只在阳光下伸懒腰的猫。 +# +# 随后,这种慵懒被一种狡黠的机敏所取代,声音在毫无干劲的叹息与看穿一切的通透感之间自如切换。在表现谋略或胜负欲的瞬间,音色会瞬间收紧,去除了所有的气声装饰与慵懒拖音,转为冷静、干练且直击要害的中高频。 +# +# 表现风格既显得捉摸不透又带有戏谑的亲和力,伴随着轻巧的换气声和偶尔出现的、带有试探意味的升调尾音。仿佛在脱力系(Listless)的无害表象之下,潜藏着绝顶聪明的头脑与绝不让步的自尊。 +# """ + +auto_instruct_prompt = """ +你是精通声学特征与戏剧表演的 AI 配音导演。你的任务是根据「待朗读文本」生成一行 TTS instruct(用于 Qwen3-TTS CustomVoice 的语音表演控制)。 + +硬性要求: +- 只输出一行(单行 KV),不要解释,不要引号/代码块,不要复述原文。 +- 必须同时包含以下字段,并用英文分号 ';' 分隔:情绪、强度、语速、停顿、表现 +- 输出格式固定为:情绪=<...>;强度=<...>;语速=<...>;停顿=<...>;表现=<...> +- 语速可选:很慢/稍慢/正常/稍快/很快 +- 停顿可选:很少/自然/稍多/很多 +- 强度可选:很低/低/中/高/很高 +- 表现:用 3-6 个短提示词,使用逗号分隔(不要用分号),如:声压高,咬字重,重音强,尾音下压 +- 长度 <= {max_chars} 字 + +强制增强规则(避免“生气但听起来不够生气”): +- 如果文本出现:非常/极其/真的/气死/怒/吼/滚/闭嘴/你再说一次 等强烈信号,情绪优先用「愤怒」,强度至少「高」,表现要包含“声压高/咬字重/重音强/尾音下压”中的至少 2 项。 +- 如果是嘲讽或冷笑式的怒气:情绪写「愤怒(冷)」或「愤怒+嘲讽」,表现包含“冷硬/压低/咬字利落/少气声”。 + +文本语言: {lang} +待朗读文本: {text} +""" + +# 基础停顿(秒)。当 instruct 包含“停顿=...”时,会按 很少/自然/稍多/很多 做倍率缩放。 +pause_linebreak = 0.18 +period_pause = 0.22 +comma_pause = 0.1 +question_pause = 0.2 +hyphen_pause = 0.06 + +[[comfyui.styles]] +name = "default" +refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/default_ref_24k_mono.wav" +prompt_text = "私にしてはがんばった方ではないでしょーか?" 
+language = "Auto" +model_choice = "1.7B" +precision = "bf16" +seed = 0 +max_new_tokens = 2048 +top_p = 0.8 +top_k = 20 +temperature = 1 +repetition_penalty = 1.05 + +[[comfyui.styles]] +name = "seiun" +mode = "custom_voice" +model_path = "/Users/xenon/Downloads/checkpoint-epoch-9" +speaker = "seiun" +instruct = "__AUTO__" +speed = 1 +language = "Auto" +seed = 0 +max_new_tokens = 2048 +top_p = 0.9 +top_k = 20 +temperature = 0.9 +repetition_penalty = 1.05 diff --git a/config_keys.py b/config_keys.py new file mode 100644 index 00000000..b7993eca --- /dev/null +++ b/config_keys.py @@ -0,0 +1,103 @@ +""" +配置键常量定义 +集中管理所有配置键,避免硬编码 +""" + + +class ConfigKeys: + """配置键常量类""" + + # ========== Plugin 配置 ========== + PLUGIN_ENABLED = "plugin.enabled" + PLUGIN_CONFIG_VERSION = "plugin.config_version" + + # ========== General 通用配置 ========== + GENERAL_DEFAULT_BACKEND = "general.default_backend" + GENERAL_TIMEOUT = "general.timeout" + GENERAL_MAX_TEXT_LENGTH = "general.max_text_length" + GENERAL_USE_REPLYER_REWRITE = "general.use_replyer_rewrite" + GENERAL_AUDIO_OUTPUT_DIR = "general.audio_output_dir" + GENERAL_USE_BASE64_AUDIO = "general.use_base64_audio" + GENERAL_SPLIT_SENTENCES = "general.split_sentences" + GENERAL_SPLIT_DELAY = "general.split_delay" + GENERAL_SPLIT_MIN_TOTAL_CHARS = "general.split_min_total_chars" + GENERAL_SPLIT_MIN_SENTENCE_CHARS = "general.split_min_sentence_chars" + GENERAL_SPLIT_MAX_SEGMENTS = "general.split_max_segments" + GENERAL_SPLIT_CHUNK_CHARS = "general.split_chunk_chars" + GENERAL_SEND_ERROR_MESSAGES = "general.send_error_messages" + + # ========== Components 组件配置 ========== + COMPONENTS_ACTION_ENABLED = "components.action_enabled" + COMPONENTS_COMMAND_ENABLED = "components.command_enabled" + COMPONENTS_INSTRUCT_COMMAND_ENABLED = "components.instruct_command_enabled" + + # ========== Probability 概率控制配置 ========== + PROBABILITY_ENABLED = "probability.enabled" + PROBABILITY_BASE_PROBABILITY = "probability.base_probability" + PROBABILITY_KEYWORD_FORCE_TRIGGER = "probability.keyword_force_trigger" + PROBABILITY_FORCE_KEYWORDS = "probability.force_keywords" + + # ========== AI Voice 配置 ========== + AI_VOICE_DEFAULT_CHARACTER = "ai_voice.default_character" + AI_VOICE_ALIAS_MAP = "ai_voice.alias_map" + + # ========== GSV2P 配置 ========== + GSV2P_API_URL = "gsv2p.api_url" + GSV2P_API_TOKEN = "gsv2p.api_token" + GSV2P_DEFAULT_VOICE = "gsv2p.default_voice" + GSV2P_TIMEOUT = "gsv2p.timeout" + GSV2P_MODEL = "gsv2p.model" + GSV2P_RESPONSE_FORMAT = "gsv2p.response_format" + GSV2P_SPEED = "gsv2p.speed" + + # ========== GPT-SoVITS 配置 ========== + GPT_SOVITS_SERVER = "gpt_sovits.server" + GPT_SOVITS_STYLES = "gpt_sovits.styles" + + # ========== Doubao 豆包配置 ========== + DOUBAO_API_URL = "doubao.api_url" + DOUBAO_APP_ID = "doubao.app_id" + DOUBAO_ACCESS_KEY = "doubao.access_key" + DOUBAO_RESOURCE_ID = "doubao.resource_id" + DOUBAO_DEFAULT_VOICE = "doubao.default_voice" + DOUBAO_TIMEOUT = "doubao.timeout" + DOUBAO_AUDIO_FORMAT = "doubao.audio_format" + DOUBAO_SAMPLE_RATE = "doubao.sample_rate" + DOUBAO_BITRATE = "doubao.bitrate" + DOUBAO_SPEED = "doubao.speed" + DOUBAO_VOLUME = "doubao.volume" + DOUBAO_CONTEXT_TEXTS = "doubao.context_texts" + + # ========== CosyVoice 配置 ========== + COSYVOICE_GRADIO_URL = "cosyvoice.gradio_url" + COSYVOICE_DEFAULT_MODE = "cosyvoice.default_mode" + COSYVOICE_DEFAULT_INSTRUCT = "cosyvoice.default_instruct" + COSYVOICE_REFERENCE_AUDIO = "cosyvoice.reference_audio" + COSYVOICE_PROMPT_TEXT = "cosyvoice.prompt_text" + COSYVOICE_TIMEOUT = "cosyvoice.timeout" + 
COSYVOICE_AUDIO_FORMAT = "cosyvoice.audio_format" + + # ========== ComfyUI (Workflow API) 配置 ========== + COMFYUI_SERVER = "comfyui.server" + COMFYUI_INPUT_DIR = "comfyui.input_dir" + COMFYUI_TIMEOUT = "comfyui.timeout" + COMFYUI_DEFAULT_STYLE = "comfyui.default_style" + COMFYUI_STYLES = "comfyui.styles" + # Convenience aliases to split voiceclone/customvoice at the plugin level. + # Both backends still use comfyui.styles, but these keys let you pick different defaults. + COMFYUI_VOICECLONE_DEFAULT_STYLE = "comfyui.voiceclone_default_style" + COMFYUI_CUSTOMVOICE_DEFAULT_STYLE = "comfyui.customvoice_default_style" + COMFYUI_AUDIO_QUALITY = "comfyui.audio_quality" + COMFYUI_MLX_PYTHON = "comfyui.mlx_python" + COMFYUI_MLX_CLI = "comfyui.mlx_cli" + COMFYUI_PAUSE_LINEBREAK = "comfyui.pause_linebreak" + COMFYUI_PERIOD_PAUSE = "comfyui.period_pause" + COMFYUI_COMMA_PAUSE = "comfyui.comma_pause" + COMFYUI_QUESTION_PAUSE = "comfyui.question_pause" + COMFYUI_HYPHEN_PAUSE = "comfyui.hyphen_pause" + + # Auto instruct (CustomVoice) + COMFYUI_AUTO_INSTRUCT_ENABLED = "comfyui.auto_instruct_enabled" + COMFYUI_AUTO_INSTRUCT_BASE_TONE = "comfyui.auto_instruct_base_tone" + COMFYUI_AUTO_INSTRUCT_PROMPT = "comfyui.auto_instruct_prompt" + COMFYUI_AUTO_INSTRUCT_MAX_CHARS = "comfyui.auto_instruct_max_chars" diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..8ee2b155 --- /dev/null +++ b/plugin.py @@ -0,0 +1,972 @@ +""" +统一TTS语音合成插件 +支持五种后端:AI Voice (MaiCore内置) / GSV2P (云API) / GPT-SoVITS (本地服务) / 豆包语音 (云API) / CosyVoice (ModelScope Gradio) + +Version: 3.2.3 +Author: 靓仔 +""" + +import sys +sys.dont_write_bytecode = True + +import asyncio +import random +from typing import List, Tuple, Type, Optional + +from src.common.logger import get_logger +from src.plugin_system.base.base_plugin import BasePlugin +from src.plugin_system.apis.plugin_register_api import register_plugin +from src.plugin_system.base.base_action import BaseAction, ActionActivationType +from src.plugin_system.base.base_command import BaseCommand +from src.plugin_system.base.component_types import ComponentInfo, ChatMode +from src.plugin_system.base.config_types import ConfigField +from src.plugin_system.apis import generator_api + +# 导入模块化的后端和工具 +from .backends import TTSBackendRegistry, TTSResult +from .backends.ai_voice import AI_VOICE_ALIAS_MAP +from .backends.doubao import DOUBAO_EMOTION_MAP +from .utils.text import TTSTextUtils +from .config_keys import ConfigKeys + +logger = get_logger("tts_voice_plugin") + +# 有效后端列表 +VALID_BACKENDS = [ + "ai_voice", + "gsv2p", + "gpt_sovits", + "doubao", + "cosyvoice", + "comfyui", + "comfyui_voiceclone", + "comfyui_customvoice", +] + + +class TTSExecutorMixin: + """ + TTS执行器混入类 + + 提供 Action 和 Command 共享的后端执行逻辑 + """ + + def _create_backend(self, backend_name: str): + """ + 创建后端实例 + + Args: + backend_name: 后端名称 + + Returns: + 后端实例 + """ + backend = TTSBackendRegistry.create( + backend_name, + self.get_config, + self.log_prefix + ) + + if backend: + # 注入必要的回调函数 + if hasattr(backend, 'set_send_custom'): + backend.set_send_custom(self.send_custom) + if hasattr(backend, 'set_send_command'): + backend.set_send_command(self.send_command) + + return backend + + async def _execute_backend( + self, + backend_name: str, + text: str, + voice: str = "", + emotion: str = "" + ) -> TTSResult: + """ + 执行指定后端 + + Args: + backend_name: 后端名称 + text: 待转换文本 + voice: 音色 + emotion: 情感(豆包后端) + + Returns: + TTSResult + """ + backend = self._create_backend(backend_name) + + if not backend: + return TTSResult( + 
success=False, + message=f"未知的TTS后端: {backend_name}" + ) + + # AI Voice 私聊限制检查 + if backend_name == "ai_voice": + is_private = self._check_is_private_chat() + if is_private: + logger.info(f"{self.log_prefix} AI语音仅支持群聊,自动切换到GSV2P后端") + return await self._execute_backend("gsv2p", text, voice, emotion) + + # Pass chat context through for backends that need MaiBot LLM APIs (e.g., comfyui auto_instruct). + chat_stream = None + if hasattr(self, "chat_stream"): + chat_stream = getattr(self, "chat_stream", None) + elif hasattr(self, "message"): + chat_stream = getattr(getattr(self, "message", None), "chat_stream", None) + + return await backend.execute(text, voice, emotion=emotion, chat_stream=chat_stream) + + def _check_is_private_chat(self) -> bool: + """检查是否是私聊""" + # Action 中使用 chat_stream + if hasattr(self, 'chat_stream'): + return not getattr(self.chat_stream, 'group_info', None) + # Command 中使用 message + if hasattr(self, 'message'): + msg_info = getattr(self.message, 'message_info', None) + if msg_info: + return not getattr(msg_info, 'group_info', None) + return False + + def _get_default_backend(self) -> str: + """获取配置的默认后端""" + backend = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p") + if backend not in VALID_BACKENDS: + logger.warning(f"{self.log_prefix} 配置的默认后端 '{backend}' 无效,使用 gsv2p") + return "gsv2p" + return backend + + async def _send_error(self, message: str) -> None: + """ + 发送错误提示信息(受全局配置控制) + + Args: + message: 错误消息 + """ + if self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True): + await self.send_text(message) + + +class UnifiedTTSAction(BaseAction, TTSExecutorMixin): + """统一TTS Action - LLM自动触发""" + + action_name = "unified_tts_action" + action_description = "用语音回复(支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)" + activation_type = ActionActivationType.KEYWORD + mode_enable = ChatMode.ALL + parallel_action = False + + activation_keywords = [ + "语音", "说话", "朗读", "念一下", "读出来", + "voice", "speak", "tts", "语音回复", "用语音说", "播报" + ] + keyword_case_sensitive = False + + action_parameters = { + "text": "要转换为语音的文本内容(必填)", + "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice,可选,建议省略让系统自动使用配置的默认后端)", + "voice": "音色/风格参数(可选)", + "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等" + } + + action_require = [ + "当用户要求用语音回复时使用", + "当回复简短问候语时使用(如早上好、晚安、你好等)", + "当想让回复更活泼生动时可以使用", + "注意:回复内容过长或者过短不适合用语音", + "注意:backend参数建议省略,系统会自动使用配置的默认后端" + ] + + associated_types = ["text", "command"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60) + self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500) + + def _check_force_trigger(self, text: str) -> bool: + """检查是否强制触发""" + if not self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True): + return False + force_keywords = self.get_config( + ConfigKeys.PROBABILITY_FORCE_KEYWORDS, + ["一定要用语音", "必须语音", "语音回复我", "务必用语音"] + ) + return any(kw in text for kw in force_keywords) + + def _probability_check(self, text: str) -> bool: + """概率控制检查""" + if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True): + return True + + base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0) + base_prob = max(0.0, min(1.0, base_prob)) + result = random.random() < base_prob + logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}") + return result + + async def _get_final_text(self, raw_text: str, 
reason: str, use_replyer: bool) -> Tuple[bool, str]: + """获取最终要转语音的文本(使用与正常回复一致的prompt参数)""" + max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200) + + if not use_replyer: + if not raw_text: + return False, "" + return True, raw_text + + try: + # 统一使用 generate_reply 以确保触发 POST_LLM 事件(日程注入) + # rewrite_reply 不会触发 POST_LLM 事件,因此不适用 + # 注意:长度约束放在末尾,利用 LLM 的"近因效应"提高遵守率 + extra_info_parts = [] + if raw_text: + extra_info_parts.append(f"期望的回复内容:{raw_text}") + # 长度约束放在最后,使用更强的表述 + extra_info_parts.append( + f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。" + f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。" + ) + + success, llm_response = await generator_api.generate_reply( + chat_stream=self.chat_stream, + reply_message=self.action_message, + reply_reason=reason, + extra_info="\n".join(extra_info_parts), + request_type="tts_voice_plugin", + from_plugin=False # 允许触发POST_LLM事件,使日程注入生效 + ) + if success and llm_response and llm_response.content: + logger.info(f"{self.log_prefix} 语音内容生成成功") + return True, llm_response.content.strip() + + # 如果生成失败但有原始文本,则使用原始文本 + if raw_text: + logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本") + return True, raw_text + + return False, "" + except Exception as e: + logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}") + return bool(raw_text), raw_text + + async def execute(self) -> Tuple[bool, str]: + def _chunk_sentences( + parts: List[str], target_chars: int, max_chunks: int + ) -> List[str]: + # Greedy packing: reduces tiny fragments into fewer, longer segments. + if not parts: + return [] + if target_chars <= 0: + target_chars = 120 + + def pack(tgt: int) -> List[str]: + out: List[str] = [] + cur = "" + for s in parts: + s = (s or "").strip() + if not s: + continue + if not cur: + cur = s + continue + if len(cur) + len(s) <= tgt: + cur += s + else: + out.append(cur) + cur = s + if cur: + out.append(cur) + return out + + packed = pack(target_chars) + if max_chunks and max_chunks > 0 and len(packed) > max_chunks: + total = len("".join(parts)) + new_target = max(target_chars, int(total / max_chunks) + 1) + packed = pack(new_target) + return packed + + async def send_message_single_sentences() -> Tuple[bool, str]: + result = await self._execute_backend(backend, clean_text, voice, emotion) + if result.success: + # 生成更详细的动作记录,帮助 planner 避免重复执行 + text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用语音回复:{text_preview}", + action_done=True + ) + else: + await self._send_error(f"语音合成失败: {result.message}") + + return result.success, result.message + async def send_message_with_splited_sentences() -> Tuple[bool, str]: + # 分段发送模式:将文本分割成句子,逐句发送语音 + if len(sentences) > 1: + logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)} 句") + + success_count = 0 + all_sentences_text = [] + + for i, sentence in enumerate(sentences): + if not sentence.strip(): + continue + + logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...") + result = await self._execute_backend(backend, sentence, voice, emotion) + + if result.success: + success_count += 1 + all_sentences_text.append(sentence) + else: + logger.warning(f"{self.log_prefix} 第 {i + 1} 句发送失败: {result.message}") + + # 句子之间添加延迟 + if i < len(sentences) - 1 and split_delay > 0: + await asyncio.sleep(split_delay) + + # 记录动作信息 + if success_count > 0: + # 生成更详细的动作记录,帮助 planner 避免重复执行 + display_text = "".join(all_sentences_text) + text_preview = display_text[:80] + "..." 
if len(display_text) > 80 else display_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用语音回复({success_count}段):{text_preview}", + action_done=True + ) + return True, f"成功发送 {success_count}/{len(sentences)} 条语音" + else: + await self._send_error("语音合成失败") + return False, "所有语音发送失败" + else: + # 只有一句,正常发送 + return await send_message_single_sentences() + + """执行TTS语音合成""" + try: + raw_text = self.action_data.get("text", "").strip() + voice = self.action_data.get("voice", "") + reason = self.action_data.get("reason", "") + emotion = self.action_data.get("emotion", "") + + use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True) + + # 获取最终文本 + success, final_text = await self._get_final_text(raw_text, reason, use_replyer) + if not success or not final_text: + await self._send_error("无法生成语音内容") + return False, "文本为空" + + # 概率检查 + force_trigger = self._check_force_trigger(final_text) + if not force_trigger and not self._probability_check(final_text): + logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复") + await self.send_text(final_text) + text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}", + action_done=True + ) + return True, "概率检查未通过,已发送文字回复" + + # 清理文本(移除特殊字符,替换网络用语) + # 注意:长度应该由LLM在生成时就遵守,这里只做字符清理 + clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length) + if not clean_text: + await self._send_error("文本处理后为空") + return False, "文本处理后为空" + + # 如果清理后的文本仍然超过限制,说明LLM未遵守约束 + if len(clean_text) > self.max_text_length: + logger.warning( + f"{self.log_prefix} LLM生成的文本超过长度限制 " + f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复" + ) + await self.send_text(clean_text) + text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text + await self.store_action_info( + action_build_into_prompt=True, + action_prompt_display=f"已用文字回复(内容过长):{text_preview}", + action_done=True + ) + return True, "内容超过语音长度限制,已改为文字回复" + + # 获取后端并执行 + backend = self._get_default_backend() + logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}") + + # 检查是否启用分段发送 + split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True) + split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3) + + sentences = None + + # 优先使用智能分割插件的分隔符 + if '|||SPLIT|||' in clean_text: + logger.info("found split marker from smart segmentation plugin") + sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()] + # If the upstream splitter is too aggressive, pack back into fewer segments. 
+                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
+                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
+                if max_segments and max_segments > 0 and len(sentences) > max_segments:
+                    sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
+                return await send_message_with_splited_sentences()
+            elif split_sentences:
+                # 自动分段:短文本不分段;长文本最多分成 N 段,避免刷屏式多段语音。
+                min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120)
+                min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6)
+                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
+                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
+
+                if len(clean_text) < min_total:
+                    sentences = [clean_text]
+                else:
+                    sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence)
+                    if max_segments and max_segments > 0:
+                        sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
+                return await send_message_with_splited_sentences()
+            else:
+                # 单句发送
+                return await send_message_single_sentences()
+
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}")
+            await self._send_error(f"语音合成出错: {error_msg}")
+            return False, error_msg
+
+
+class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin):
+    """统一TTS Command - 用户手动触发"""
+
+    command_name = "unified_tts_command"
+    command_description = "将文本转换为语音,支持多种后端和音色"
+    # 命名捕获组 text / voice / backend 与 execute() 中 matched_groups 的键一一对应
+    command_pattern = r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?(?:\s+(?P<backend>ai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$"
+    command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]"
+    command_examples = [
+        "/tts 你好,世界!",
+        "/tts 今天天气不错 -v 小新",
+        "/gptsovits 你好世界 -v default",
+        "/cosyvoice 你好世界 -v 四川话",
+        "/tts 试试 -v 温柔妹妹 ai_voice",
+        "/gsv2p 你好世界",
+        "/doubao 你好世界 -v 开心"
+    ]
+    intercept_message = True
+
+    async def _send_help(self):
+        """发送帮助信息"""
+        default_backend = self._get_default_backend()
+
+        help_text = """【TTS语音合成插件帮助】
+
+📝 基本语法:
+/tts <文本> [-v <音色>] [后端]
+
+🎯 快捷命令:
+/tts <文本>              使用默认后端
+/voice <文本>            使用 AI Voice
+/gsv2p <文本>            使用 GSV2P
+/gptsovits <文本>        使用 GPT-SoVITS
+/doubao <文本>           使用 豆包语音
+/cosyvoice <文本>        使用 CosyVoice
+/comfyui <文本>          使用 ComfyUI(本地工作流)
+/comfyui_voiceclone <文本>   使用 ComfyUI VoiceClone
+/comfyui_customvoice <文本>  使用 ComfyUI CustomVoice
+
+🔊 可用后端:
+• ai_voice - MaiCore内置(仅群聊)
+• gsv2p - 云端API,高质量
+• gpt_sovits - 本地服务,可定制
+• doubao - 火山引擎,支持情感
+• cosyvoice - 阿里云,支持方言
+• comfyui - 本地ComfyUI工作流(自动按 style.mode 选择)
+• comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone)
+• comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice)
+
+🎭 音色/情感参数(-v):
+• AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种
+• GSV2P: 原神-中文-派蒙_ZH 等(见API文档)
+• 豆包: 开心、生气、伤心、撒娇、严肃 等
+• CosyVoice: 广东话、四川话、东北话、开心、慢速 等
+
+📌 示例:
+/tts 你好世界
+/tts 今天真开心 -v 开心
+/gptsovits 这是本地语音合成
+/doubao 我生气了 -v 生气
+/cosyvoice 你好 -v 广东话
+/voice 测试一下 -v 温柔妹妹
+
+⚙️ 当前默认后端:""" + default_backend
+
+        await self.send_text(help_text)
+
+    def _determine_backend(self, user_backend: str) -> Tuple[str, str]:
+        """
+        确定使用的后端
+
+        Returns:
+            (backend_name, source_description)
+        """
+        # 1. 检查命令前缀
+        raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text
+        if raw_text:
+            # 命令前缀到后端的映射
+            prefix_backend_map = {
+                "/gsv2p": "gsv2p",
+                "/gptsovits": "gpt_sovits",
+                "/doubao": "doubao",
+                "/cosyvoice": "cosyvoice",
+                "/voice": "ai_voice",
+                "/comfyui": "comfyui",
+                "/comfyui_voiceclone": "comfyui_voiceclone",
+                "/comfyui_customvoice": "comfyui_customvoice",
+            }
+            for prefix, backend in prefix_backend_map.items():
+                if raw_text.startswith(prefix):
+                    return backend, f"命令前缀 {prefix}"
+
+        # 2. 检查命令参数
+        if user_backend and user_backend in VALID_BACKENDS:
+            return user_backend, f"命令参数 {user_backend}"
+
+        # 3. 使用配置文件默认值
+        return self._get_default_backend(), "配置文件"
+
+    async def execute(self) -> Tuple[bool, str, bool]:
+        """执行TTS命令"""
+        try:
+            text = self.matched_groups.get("text", "").strip()
+            voice = self.matched_groups.get("voice", "")
+            user_backend = self.matched_groups.get("backend", "")
+
+            # 处理帮助命令
+            if text.lower() == "help":
+                await self._send_help()
+                return True, "显示帮助信息", True
+
+            if not text:
+                await self._send_error("请输入要转换为语音的文本内容")
+                return False, "缺少文本内容", True
+
+            # 确定后端
+            backend, backend_source = self._determine_backend(user_backend)
+
+            # 清理文本
+            max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)
+            clean_text = TTSTextUtils.clean_text(text, max_length)
+
+            if not clean_text:
+                await self._send_error("文本处理后为空")
+                return False, "文本处理后为空", True
+
+            # 检查长度限制
+            if len(clean_text) > max_length:
+                await self.send_text(
+                    f"文本过长({len(clean_text)}字符),"
+                    f"超过语音合成限制({max_length}字符),"
+                    f"已改为文字发送。\n\n{clean_text}"
+                )
+                return True, "文本过长,已改为文字发送", True
+
+            logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})")
+
+            # 执行后端
+            # 对于 CosyVoice 和豆包,voice 参数实际上是情感/方言
+            if backend in ["cosyvoice", "doubao"]:
+                result = await self._execute_backend(backend, clean_text, voice="", emotion=voice)
+            else:
+                result = await self._execute_backend(backend, clean_text, voice)
+
+            if not result.success:
+                await self._send_error(f"语音合成失败: {result.message}")
+
+            return result.success, result.message, True
+
+        except Exception as e:
+            logger.error(f"{self.log_prefix} TTS命令执行出错: {e}")
+            await self._send_error(f"语音合成出错: {e}")
+            return False, f"执行出错: {e}", True
+
+
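# ---------------------------------------------------------------------------
# Editorial sketch (not part of the commit): a self-contained check of the
# named capture groups used in command_pattern above. The pattern here is a
# trimmed copy (the backend alternation is shortened to two prefixes for
# brevity) and _demo_command_pattern is a hypothetical helper, not plugin API.
# ---------------------------------------------------------------------------
def _demo_command_pattern() -> None:
    import re

    # Same group names the command relies on via matched_groups.
    pattern = re.compile(r"^/(?:tts|voice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?$")
    m = pattern.match("/tts 今天真开心 -v 开心")
    assert m is not None
    assert m.group("text") == "今天真开心"   # surfaces as matched_groups["text"]
    assert m.group("voice") == "开心"        # surfaces as matched_groups["voice"]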
+class TTSInstructCommand(BaseCommand):
+    """生成 CustomVoice instruct(调试/预览用)"""
+
+    command_name = "tts_instruct_command"
+    command_description = "根据待朗读文本生成 CustomVoice 的 instruct(情绪/语速/停顿)"
+    command_pattern = r"^/tts_instruct\s+(?P<text>.+?)$"
+    command_help = "用法:/tts_instruct <文本>"
+    command_examples = [
+        "/tts_instruct 早上好,今天也要加油。",
+        "/tts_instruct えっ?本当にそうなの?",
+    ]
+    intercept_message = True
+
+    async def execute(self) -> Tuple[bool, str, int]:
+        try:
+            text = (self.matched_groups.get("text") or "").strip()
+            if not text:
+                await self.send_text("请输入要生成 instruct 的文本")
+                return False, "缺少文本", 2
+
+            # Use the same logic as ComfyUI backend auto_instruct.
+ from .backends.comfyui import ComfyUIBackend + from .utils.text import TTSTextUtils + + detected = TTSTextUtils.detect_language(text) + chat_stream = getattr(self.message, "chat_stream", None) + chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None + + backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix) + instruct = await backend._infer_instruct( + text=text, + detected_lang=detected, + chat_stream=chat_stream, + chat_id=chat_id, + style_name="__command__", + ) + + if not instruct: + await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)") + return False, "instruct 生成失败", 2 + + await self.send_text(instruct) + return True, "instruct 已生成", 2 + except Exception as e: + await self.send_text(f"instruct 生成异常: {e}") + return False, str(e), 2 + + +@register_plugin +class UnifiedTTSPlugin(BasePlugin): + """统一TTS语音合成插件 - 支持多后端的文本转语音插件""" + + plugin_name = "tts_voice_plugin" + plugin_description = "统一TTS语音合成插件,支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端" + plugin_version = "3.2.3" + plugin_author = "靓仔" + enable_plugin = True + config_file_name = "config.toml" + dependencies = [] + python_dependencies = ["aiohttp"] + + config_section_descriptions = { + "plugin": "插件基本配置", + "general": "通用设置", + "components": "组件启用控制", + "probability": "概率控制配置", + "ai_voice": "AI Voice后端配置", + "gsv2p": "GSV2P后端配置", + "gpt_sovits": "GPT-SoVITS后端配置", + "doubao": "豆包语音后端配置", + "cosyvoice": "CosyVoice后端配置", + "comfyui": "ComfyUI工作流API后端配置" + } + + config_schema = { + "plugin": { + "enabled": ConfigField(type=bool, default=True, description="是否启用插件"), + "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本") + }, + "general": { + "default_backend": ConfigField( + type=str, default="cosyvoice", + description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)" + ), + "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"), + "max_text_length": ConfigField( + type=int, default=200, + description="最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)" + ), + "use_replyer_rewrite": ConfigField( + type=bool, default=True, + description="是否使用replyer润色语音内容" + ), + "audio_output_dir": ConfigField( + type=str, default="", + description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)" + ), + "use_base64_audio": ConfigField( + type=bool, default=True, + description="是否使用base64编码发送音频(备选方案)" + ), + "split_sentences": ConfigField( + type=bool, default=True, + description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)" + ), + "split_delay": ConfigField( + type=float, default=0.3, + description="分段发送时每条语音之间的延迟(秒)" + ), + "split_min_total_chars": ConfigField( + type=int, default=120, + description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)", + ), + "split_min_sentence_chars": ConfigField( + type=int, default=6, + description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)", + ), + "split_max_segments": ConfigField( + type=int, default=3, + description="自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。", + ), + "split_chunk_chars": ConfigField( + type=int, default=110, + description="自动分段打包目标长度(字符)。用于把多句合并成更少段。", + ), + "send_error_messages": ConfigField( + type=bool, default=True, + description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)" + ) + }, + "components": { + "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"), + "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"), + "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)") + }, + "probability": { + 
"enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"), + "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"), + "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"), + "force_keywords": ConfigField( + type=list, + default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"], + description="强制触发关键词" + ) + }, + "ai_voice": { + "default_character": ConfigField( + type=str, + default="邻家小妹", + description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)" + ) + }, + "gsv2p": { + "api_url": ConfigField( + type=str, default="https://gsv2p.acgnai.top/v1/audio/speech", + description="GSV2P API地址" + ), + "api_token": ConfigField(type=str, default="", description="API认证Token"), + "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"), + "timeout": ConfigField(type=int, default=120, description="API请求超时(秒)"), + "model": ConfigField(type=str, default="tts-v4", description="TTS模型"), + "response_format": ConfigField(type=str, default="wav", description="音频格式"), + "speed": ConfigField(type=float, default=1.0, description="语音速度") + }, + "gpt_sovits": { + "server": ConfigField( + type=str, default="http://127.0.0.1:9880", + description="GPT-SoVITS服务地址" + ), + "styles": ConfigField( + type=list, + default=[ + { + "name": "default", + "refer_wav": "", + "prompt_text": "", + "prompt_language": "zh", + "gpt_weights": "", + "sovits_weights": "" + } + ], + description="语音风格配置", + item_type="object", + item_fields={ + "name": {"type": "string", "label": "风格名称", "required": True}, + "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, + "prompt_text": {"type": "string", "label": "参考文本", "required": True}, + "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"}, + "gpt_weights": {"type": "string", "label": "GPT模型权重路径(可选)", "required": False}, + "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径(可选)", "required": False} + } + ) + }, + "doubao": { + "api_url": ConfigField( + type=str, + default="https://openspeech.bytedance.com/api/v3/tts/unidirectional", + description="豆包语音API地址" + ), + "app_id": ConfigField(type=str, default="", description="豆包APP ID"), + "access_key": ConfigField(type=str, default="", description="豆包Access Key"), + "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"), + "default_voice": ConfigField( + type=str, default="zh_female_vv_uranus_bigtts", + description="默认音色" + ), + "timeout": ConfigField(type=int, default=60, description="API请求超时(秒)"), + "audio_format": ConfigField(type=str, default="wav", description="音频格式"), + "sample_rate": ConfigField(type=int, default=24000, description="采样率"), + "bitrate": ConfigField(type=int, default=128000, description="比特率"), + "speed": ConfigField(type=float, default=None, description="语音速度(可选)"), + "volume": ConfigField(type=float, default=None, description="音量(可选)"), + "context_texts": ConfigField( + type=list, default=None, + description="上下文辅助文本(可选,仅豆包2.0模型)" + ) + }, + "cosyvoice": { + "gradio_url": ConfigField( + type=str, + default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/", + description="Gradio API地址" + ), + "default_mode": ConfigField( + type=str, + default="3s极速复刻", + description="推理模式(3s极速复刻/自然语言控制)" + ), + "default_instruct": ConfigField( + type=str, + default="You are a helpful assistant. 
请用广东话表达。<|endofprompt|>", + description="默认指令(用于自然语言控制模式)" + ), + "reference_audio": ConfigField( + type=str, + default="", + description="参考音频路径(用于3s极速复刻模式)" + ), + "prompt_text": ConfigField( + type=str, + default="", + description="提示文本(用于3s极速复刻模式)" + ), + "timeout": ConfigField(type=int, default=300, description="API请求超时(秒)"), + "audio_format": ConfigField(type=str, default="wav", description="音频格式") + }, + "comfyui": { + "server": ConfigField( + type=str, + default="http://127.0.0.1:8188", + description="ComfyUI 服务地址(示例: http://127.0.0.1:8188)", + ), + "input_dir": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input", + description="ComfyUI input 目录(用于放参考音频,LoadAudio 会从这里读)", + ), + "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"), + "audio_quality": ConfigField( + type=str, + default="128k", + description="输出 MP3 质量(SaveAudioMP3 quality: V0/128k/320k)", + ), + "mlx_python": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python", + description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)", + ), + "mlx_cli": ConfigField( + type=str, + default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py", + description="mlx_voice_clone_cli.py 路径", + ), + "default_style": ConfigField(type=str, default="default", description="默认风格名称"), + "voiceclone_default_style": ConfigField( + type=str, + default="", + description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style)", + ), + "customvoice_default_style": ConfigField( + type=str, + default="", + description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style)", + ), + "auto_instruct_enabled": ConfigField( + type=bool, + default=False, + description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)", + ), + "auto_instruct_max_chars": ConfigField( + type=int, + default=120, + description="自动推断 instruct 的最大长度(字符)。建议 80-160,太短会导致情绪/表演提示被截断。", + ), + "auto_instruct_prompt": ConfigField( + type=str, + default="", + description="自定义 instruct 推断 prompt(留空使用内置模板)", + ), + "auto_instruct_base_tone": ConfigField( + type=str, + default="", + description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`)", + ), + "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"), + "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"), + "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"), + "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"), + "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"), + "styles": ConfigField( + type=list, + default=[ + { + "name": "default", + "refer_wav": "", + "prompt_text": "", + "language": "", + "model_choice": "1.7B", + "precision": "bf16", + "seed": 0, + "max_new_tokens": 2048, + "top_p": 0.8, + "top_k": 20, + "temperature": 1.0, + "repetition_penalty": 1.05, + } + ], + description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)", + item_type="object", + item_fields={ + "name": {"type": "string", "label": "风格名称", "required": True}, + "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False}, + "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, + "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True}, + "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) 
", "required": False}, + "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False}, + "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False}, + "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False}, + "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False}, + "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False}, + "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False}, + "speed": {"type": "number", "label": "speed", "required": False}, + "seed": {"type": "number", "label": "seed", "required": False}, + "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False}, + "top_p": {"type": "number", "label": "top_p", "required": False}, + "top_k": {"type": "number", "label": "top_k", "required": False}, + "temperature": {"type": "number", "label": "temperature", "required": False}, + "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False}, + }, + ), + } + } + + def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]: + """返回插件组件列表""" + components = [] + + try: + action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True) + command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True) + instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True) + except AttributeError: + action_enabled = True + command_enabled = True + instruct_enabled = True + + if action_enabled: + components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction)) + + if command_enabled: + components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand)) + + if instruct_enabled: + components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand)) + + return components diff --git a/test.wav b/test.wav new file mode 100644 index 00000000..37550701 Binary files /dev/null and b/test.wav differ diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..1c0e5cd4 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,12 @@ +""" +TTS工具模块 +""" + +import sys +sys.dont_write_bytecode = True + +from .text import TTSTextUtils +from .session import TTSSessionManager +from .file import TTSFileManager + +__all__ = ["TTSTextUtils", "TTSSessionManager", "TTSFileManager"] diff --git a/utils/file.py b/utils/file.py new file mode 100644 index 00000000..c56469a6 --- /dev/null +++ b/utils/file.py @@ -0,0 +1,280 @@ +""" +文件操作工具类 +提供异步文件操作、临时文件管理等功能 +""" + +import os +import uuid +import tempfile +import asyncio +import base64 +from typing import Optional +from src.common.logger import get_logger + +logger = get_logger("tts_file_manager") + +# 音频数据最小有效大小(字节) +MIN_AUDIO_SIZE = 100 + + +class TTSFileManager: + """ + TTS文件管理器 + + 提供: + - 临时文件创建(避免并发冲突) + - 异步文件写入 + - 自动清理 + - 相对路径和绝对路径支持 + """ + + # 临时文件目录(兼容旧代码) + _temp_dir: Optional[str] = None + + # 项目根目录(用于解析相对路径) + _project_root: Optional[str] = None + + @classmethod + def set_project_root(cls, root_path: str): + """设置项目根目录""" + if os.path.isdir(root_path): + cls._project_root = root_path + logger.debug(f"设置项目根目录: {root_path}") + else: + logger.warning(f"项目根目录不存在: {root_path}") + + @classmethod + def get_project_root(cls) -> str: + """获取项目根目录""" + if cls._project_root is None: + # 尝试从当前文件位置推断项目根目录 + current_file = os.path.abspath(__file__) + # 假设结构是: project_root/plugins/tts_voice_plugin/utils/file.py + cls._project_root = 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file)))) + logger.debug(f"自动推断项目根目录: {cls._project_root}") + return cls._project_root + + @classmethod + def resolve_path(cls, path: str) -> str: + """ + 解析路径(支持相对路径和绝对路径) + + Args: + path: 路径字符串 + + Returns: + 解析后的绝对路径 + """ + if os.path.isabs(path): + # 已经是绝对路径 + return path + else: + # 相对路径,相对于项目根目录 + return os.path.join(cls.get_project_root(), path) + + @classmethod + def ensure_dir(cls, dir_path: str) -> bool: + """ + 确保目录存在,不存在则创建 + + Args: + dir_path: 目录路径 + + Returns: + 是否成功 + """ + try: + os.makedirs(dir_path, exist_ok=True) + return True + except Exception as e: + logger.error(f"创建目录失败: {dir_path}, 错误: {e}") + return False + + @classmethod + def get_temp_dir(cls) -> str: + """ + 获取临时文件目录(已废弃,保留兼容性) + + Returns: + 临时目录路径 + """ + if cls._temp_dir is None: + cls._temp_dir = tempfile.gettempdir() + return cls._temp_dir + + @classmethod + def set_temp_dir(cls, path: str): + """ + 设置临时文件目录(已废弃,保留兼容性) + + Args: + path: 目录路径 + """ + if os.path.isdir(path): + cls._temp_dir = path + else: + raise ValueError(f"目录不存在: {path}") + + @classmethod + def generate_temp_path(cls, prefix: str = "tts", suffix: str = ".mp3", output_dir: str = "") -> str: + """ + 生成唯一的临时文件路径 + + Args: + prefix: 文件名前缀 + suffix: 文件扩展名 + output_dir: 输出目录(支持相对路径和绝对路径,留空使用项目根目录) + + Returns: + 临时文件的绝对路径 + """ + # 确定输出目录 + if not output_dir: + # 默认使用项目根目录 + resolved_dir = cls.get_project_root() + else: + # 解析用户配置的路径 + resolved_dir = cls.resolve_path(output_dir) + # 确保目录存在 + if not cls.ensure_dir(resolved_dir): + # 如果创建失败,降级到项目根目录 + logger.warning(f"无法创建输出目录 {resolved_dir},使用项目根目录") + resolved_dir = cls.get_project_root() + + # 生成唯一文件名 + unique_id = uuid.uuid4().hex[:12] + filename = f"{prefix}_{unique_id}{suffix}" + return os.path.join(resolved_dir, filename) + + @classmethod + async def write_audio_async(cls, path: str, data: bytes) -> bool: + """ + 异步写入音频数据到文件 + + Args: + path: 文件路径 + data: 音频二进制数据 + + Returns: + 是否写入成功 + """ + try: + # 使用线程池执行同步文件写入,避免阻塞事件循环 + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, cls._write_file_sync, path, data) + logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)") + return True + except IOError as e: + logger.error(f"写入音频文件失败: {path}, 错误: {e}") + return False + except Exception as e: + logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}") + return False + + @staticmethod + def _write_file_sync(path: str, data: bytes): + """同步写入文件(内部方法)""" + with open(path, "wb") as f: + f.write(data) + + @classmethod + def write_audio_sync(cls, path: str, data: bytes) -> bool: + """ + 同步写入音频数据到文件 + + Args: + path: 文件路径 + data: 音频二进制数据 + + Returns: + 是否写入成功 + """ + try: + cls._write_file_sync(path, data) + logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)") + return True + except IOError as e: + logger.error(f"写入音频文件失败: {path}, 错误: {e}") + return False + except Exception as e: + logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}") + return False + + @classmethod + def cleanup_file(cls, path: str, silent: bool = True) -> bool: + """ + 清理临时文件 + + Args: + path: 文件路径 + silent: 是否静默处理错误 + + Returns: + 是否清理成功 + """ + try: + if path and os.path.exists(path): + os.remove(path) + logger.debug(f"临时文件已清理: {path}") + return True + return False + except Exception as e: + if not silent: + logger.warning(f"清理临时文件失败: {path}, 错误: {e}") + return False + + @classmethod + async def cleanup_file_async(cls, path: str, delay: float = 0) -> bool: + """ + 异步清理临时文件(可延迟) + + Args: + path: 文件路径 + delay: 延迟秒数 + + Returns: + 是否清理成功 + """ + if delay > 0: + 
await asyncio.sleep(delay) + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, cls.cleanup_file, path, True) + + @classmethod + def validate_audio_data(cls, data: bytes, min_size: int = None) -> tuple: + """ + 验证音频数据有效性 + + Args: + data: 音频二进制数据 + min_size: 最小有效大小 + + Returns: + (is_valid, error_message) + """ + if data is None: + return False, "音频数据为空" + + min_size = min_size or MIN_AUDIO_SIZE + + if len(data) < min_size: + return False, f"音频数据过小({len(data)}字节 < {min_size}字节)" + + return True, "" + + @classmethod + def audio_to_base64(cls, data: bytes) -> str: + """ + 将音频数据转换为base64字符串 + + Args: + data: 音频二进制数据 + + Returns: + base64编码的字符串 + """ + try: + return base64.b64encode(data).decode('utf-8') + except Exception as e: + logger.error(f"音频数据转base64失败: {e}") + return "" diff --git a/utils/session.py b/utils/session.py new file mode 100644 index 00000000..8535b04c --- /dev/null +++ b/utils/session.py @@ -0,0 +1,186 @@ +""" +HTTP Session 管理器 +提供连接池复用,避免每次请求创建新连接 +""" + +import asyncio +import aiohttp +from typing import Optional, Dict, Any +from contextlib import asynccontextmanager +from src.common.logger import get_logger + +logger = get_logger("tts_session_manager") + + +class TTSSessionManager: + """ + TTS HTTP Session 管理器 + + 提供: + - 连接池复用 + - 自动超时管理 + - 优雅关闭 + """ + + _instance: Optional["TTSSessionManager"] = None + _lock = asyncio.Lock() + + def __init__(self): + self._sessions: Dict[str, aiohttp.ClientSession] = {} + self._default_timeout = 60 + + @classmethod + async def get_instance(cls) -> "TTSSessionManager": + """获取单例实例""" + if cls._instance is None: + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + async def get_session( + self, + backend_name: str = "default", + timeout: int = None + ) -> aiohttp.ClientSession: + """ + 获取或创建 HTTP Session + + Args: + backend_name: 后端名称,用于区分不同的session + timeout: 超时时间(秒) + + Returns: + aiohttp.ClientSession 实例 + """ + if backend_name not in self._sessions or self._sessions[backend_name].closed: + timeout_val = timeout or self._default_timeout + connector = aiohttp.TCPConnector( + limit=10, # 每个主机最大连接数 + limit_per_host=5, + ttl_dns_cache=300, # DNS缓存5分钟 + force_close=True, # 禁用连接复用,修复GSV2P等API的兼容性问题 + ) + self._sessions[backend_name] = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=timeout_val) + ) + logger.debug(f"创建新的HTTP Session: {backend_name}") + + return self._sessions[backend_name] + + async def close_session(self, backend_name: str = None): + """ + 关闭指定或所有 Session + + Args: + backend_name: 后端名称,为None时关闭所有 + """ + if backend_name: + if backend_name in self._sessions: + await self._sessions[backend_name].close() + del self._sessions[backend_name] + logger.debug(f"关闭HTTP Session: {backend_name}") + else: + for name, session in self._sessions.items(): + if not session.closed: + await session.close() + logger.debug(f"关闭HTTP Session: {name}") + self._sessions.clear() + + @asynccontextmanager + async def post( + self, + url: str, + json: Dict[str, Any] = None, + headers: Dict[str, str] = None, + data: Any = None, + backend_name: str = "default", + timeout: int = None + ): + """ + 发送POST请求(异步上下文管理器) + + Args: + url: 请求URL + json: JSON请求体 + headers: 请求头 + data: 表单数据 + backend_name: 后端名称 + timeout: 超时时间 + + Yields: + aiohttp.ClientResponse + + Usage: + async with session_manager.post(url, json=data) as response: + ... 
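            # Fuller, hypothetical call (editorial sketch, not part of the
            # commit); it mirrors how the GPT-SoVITS backend above issues
            # its /tts request:
            async with session_manager.post(
                "http://127.0.0.1:9880/tts",
                json={"text": "你好", "text_lang": "zh"},
                backend_name="gpt_sovits",
                timeout=60,
            ) as response:
                if response.status == 200:
                    audio_data = await response.read()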
+ """ + session = await self.get_session(backend_name, timeout) + + # 如果指定了不同的超时时间,创建新的超时对象 + req_timeout = None + if timeout: + req_timeout = aiohttp.ClientTimeout(total=timeout) + + response = await session.post( + url, + json=json, + headers=headers, + data=data, + timeout=req_timeout + ) + try: + yield response + finally: + response.release() + + @asynccontextmanager + async def get( + self, + url: str, + headers: Dict[str, str] = None, + params: Dict[str, Any] = None, + backend_name: str = "default", + timeout: int = None + ): + """ + 发送GET请求(异步上下文管理器) + + Args: + url: 请求URL + headers: 请求头 + params: URL参数 + backend_name: 后端名称 + timeout: 超时时间 + + Yields: + aiohttp.ClientResponse + + Usage: + async with session_manager.get(url) as response: + ... + """ + session = await self.get_session(backend_name, timeout) + + # 如果指定了不同的超时时间,创建新的超时对象 + req_timeout = None + if timeout: + req_timeout = aiohttp.ClientTimeout(total=timeout) + + response = await session.get( + url, + headers=headers, + params=params, + timeout=req_timeout + ) + try: + yield response + finally: + response.release() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close_session() diff --git a/utils/text.py b/utils/text.py new file mode 100644 index 00000000..93524c08 --- /dev/null +++ b/utils/text.py @@ -0,0 +1,181 @@ +""" +文本处理工具类 +""" + +import re +from typing import Optional, List + + +class TTSTextUtils: + """TTS文本处理工具类""" + + # 网络用语替换映射 + NETWORK_SLANG_MAP = { + 'www': '哈哈哈', + 'hhh': '哈哈', + '233': '哈哈', + '666': '厉害', + '88': '拜拜', + '...': '。', + '……': '。' + } + + # 需要移除的特殊字符正则 + SPECIAL_CHAR_PATTERN = re.compile( + r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,。!?、;:()【】"\'.,!?;:()\[\]`-]' + ) + + # 语言检测正则 + CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]') + ENGLISH_PATTERN = re.compile(r'[a-zA-Z]') + JAPANESE_PATTERN = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]') + + @classmethod + def clean_text(cls, text: str, max_length: int = 500) -> str: + """ + 清理文本,移除特殊字符,替换网络用语 + + Args: + text: 原始文本 + max_length: 最大长度限制(此参数已不用于硬截断,仅用于参考) + + Returns: + 清理后的文本(不会硬截断,保留完整内容以便上层决策) + """ + if not text: + return "" + + # 注释掉文本清理功能,保留原始格式 + # 移除不支持的特殊字符 + # text = cls.SPECIAL_CHAR_PATTERN.sub('', text) + + # 替换常见网络用语 + # for old, new in cls.NETWORK_SLANG_MAP.items(): + # text = text.replace(old, new) + + return text.strip() + + @classmethod + def detect_language(cls, text: str) -> str: + """ + 检测文本语言 + + Args: + text: 待检测文本 + + Returns: + 语言代码 (zh/ja/en) + """ + if not text: + return "zh" + + chinese_chars = len(cls.CHINESE_PATTERN.findall(text)) + english_chars = len(cls.ENGLISH_PATTERN.findall(text)) + japanese_chars = len(cls.JAPANESE_PATTERN.findall(text)) + total_chars = chinese_chars + english_chars + japanese_chars + + if total_chars == 0: + return "zh" + + chinese_ratio = chinese_chars / total_chars + japanese_ratio = japanese_chars / total_chars + english_ratio = english_chars / total_chars + + if chinese_ratio > 0.3: + return "zh" + elif japanese_ratio > 0.3: + return "ja" + elif english_ratio > 0.8: + return "en" + else: + return "zh" + + @classmethod + def resolve_voice_alias( + cls, + voice: Optional[str], + alias_map: dict, + default: str, + prefix: str = "" + ) -> str: + """ + 解析音色别名 + + Args: + voice: 用户指定的音色 + alias_map: 别名映射表 + default: 默认音色 + prefix: 内部音色ID前缀(如 "lucy-voice-") + + Returns: + 解析后的音色ID + """ + if not voice: + voice = default + + # 如果已经是内部ID格式,直接返回 + if prefix and voice.startswith(prefix): + return voice + + # 尝试从别名映射查找 
+ if voice in alias_map: + return alias_map[voice] + + # 尝试使用默认值的别名 + if default in alias_map: + return alias_map[default] + + return default + + @classmethod + def split_sentences(cls, text: str, min_length: int = 2) -> List[str]: + """ + 将文本分割成句子 + + Args: + text: 待分割文本 + min_length: 最小句子长度,过短的句子会合并到前一句 + + Returns: + 句子列表 + """ + if not text: + return [] + + # 使用中英文标点分割 + # 保留分隔符以便后续处理 + pattern = r'([。!?!?;;])' + parts = re.split(pattern, text) + + sentences = [] + current = "" + + for i, part in enumerate(parts): + if not part: + continue + + # 如果是标点符号,附加到当前句子 + if re.match(pattern, part): + current += part + else: + # 如果当前句子不为空,先保存 + if current.strip(): + sentences.append(current.strip()) + current = part + + # 处理最后一段 + if current.strip(): + sentences.append(current.strip()) + + # 合并过短的句子 + if min_length > 0 and len(sentences) > 1: + merged = [] + for sent in sentences: + if merged and len(sent) < min_length: + # 合并到前一句 + merged[-1] += sent + else: + merged.append(sent) + sentences = merged + + return sentences
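# ---------------------------------------------------------------------------
# Editorial sketch (not part of the commit): expected behaviour of the
# helpers above, under the implementation as written. utils/text.py only
# depends on the standard library, so this can be run directly to verify.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Sentences split on CJK/ASCII sentence punctuation; a trailing fragment
    # shorter than min_length ("好。", 2 chars) merges into the previous one.
    sents = TTSTextUtils.split_sentences("今天天气不错。出去走走吧!好。", min_length=3)
    assert sents == ["今天天气不错。", "出去走走吧!好。"]

    # Language detection is ratio-based over CJK/kana/Latin character counts.
    assert TTSTextUtils.detect_language("こんにちは") == "ja"
    assert TTSTextUtils.detect_language("hello world") == "en"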